You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by pr...@apache.org on 2014/09/25 13:29:20 UTC
svn commit: r1627519 - in
/pig/branches/spark/src/org/apache/pig/backend/hadoop/executionengine/spark:
SparkLauncher.java converter/SkewedJoinConverter.java
Author: praveen
Date: Thu Sep 25 11:29:19 2014
New Revision: 1627519
URL: http://svn.apache.org/r1627519
Log:
PIG-4191: Make skewed join work (praveen)
Added:
pig/branches/spark/src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java (with props)
Modified:
pig/branches/spark/src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLauncher.java
Modified: pig/branches/spark/src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLauncher.java
URL: http://svn.apache.org/viewvc/pig/branches/spark/src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLauncher.java?rev=1627519&r1=1627518&r2=1627519&view=diff
==============================================================================
--- pig/branches/spark/src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLauncher.java (original)
+++ pig/branches/spark/src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLauncher.java Thu Sep 25 11:29:19 2014
@@ -30,6 +30,7 @@ import org.apache.pig.backend.hadoop.exe
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLoad;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLocalRearrange;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POPackage;
+import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POSkewedJoin;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POSort;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POSplit;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
@@ -45,6 +46,7 @@ import org.apache.pig.backend.hadoop.exe
import org.apache.pig.backend.hadoop.executionengine.spark.converter.LocalRearrangeConverter;
import org.apache.pig.backend.hadoop.executionengine.spark.converter.POConverter;
import org.apache.pig.backend.hadoop.executionengine.spark.converter.PackageConverter;
+import org.apache.pig.backend.hadoop.executionengine.spark.converter.SkewedJoinConverter;
import org.apache.pig.backend.hadoop.executionengine.spark.converter.SortConverter;
import org.apache.pig.backend.hadoop.executionengine.spark.converter.SplitConverter;
import org.apache.pig.backend.hadoop.executionengine.spark.converter.StoreConverter;
@@ -132,6 +134,7 @@ public class SparkLauncher extends Launc
convertMap.put(POUnion.class, new UnionConverter(sparkContext));
convertMap.put(POSort.class, new SortConverter());
convertMap.put(POSplit.class, new SplitConverter());
+ convertMap.put(POSkewedJoin.class, new SkewedJoinConverter());
convertMap.put(POCollectedGroup.class, new CollectedGroupConverter());
Map<OperatorKey, RDD<Tuple>> rdds = new HashMap<OperatorKey, RDD<Tuple>>();
Added: pig/branches/spark/src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java
URL: http://svn.apache.org/viewvc/pig/branches/spark/src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java?rev=1627519&view=auto
==============================================================================
--- pig/branches/spark/src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java (added)
+++ pig/branches/spark/src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java Thu Sep 25 11:29:19 2014
@@ -0,0 +1,199 @@
+package org.apache.pig.backend.hadoop.executionengine.spark.converter;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.pig.backend.executionengine.ExecException;
+import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
+import org.apache.pig.backend.hadoop.executionengine.physicalLayer.Result;
+import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
+import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLocalRearrange;
+import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POSkewedJoin;
+import org.apache.pig.backend.hadoop.executionengine.spark.SparkUtil;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.impl.plan.NodeIdGenerator;
+import org.apache.pig.impl.plan.OperatorKey;
+import org.apache.pig.impl.plan.PlanException;
+import org.apache.pig.impl.util.MultiMap;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.rdd.RDD;
+
+import scala.Tuple2;
+import scala.runtime.AbstractFunction1;
+
+public class SkewedJoinConverter implements
+ POConverter<Tuple, Tuple, POSkewedJoin>, Serializable {
+
+ private POLocalRearrange[] LRs;
+ private POSkewedJoin poSkewedJoin;
+
+ @Override
+ public RDD<Tuple> convert(List<RDD<Tuple>> predecessors,
+ POSkewedJoin poSkewedJoin) throws IOException {
+
+ SparkUtil.assertPredecessorSize(predecessors, poSkewedJoin, 2);
+ LRs = new POLocalRearrange[2];
+ this.poSkewedJoin = poSkewedJoin;
+
+ createJoinPlans(poSkewedJoin.getJoinPlans());
+
+ // extract the two RDDs
+ RDD<Tuple> rdd1 = predecessors.get(0);
+ RDD<Tuple> rdd2 = predecessors.get(1);
+
+ // make (key, value) pairs, key has type Object, value has type Tuple
+ RDD<Tuple2<Object, Tuple>> rdd1Pair = rdd1.map(new ExtractKeyFunction(
+ this, 0), SparkUtil.<Object, Tuple>getTuple2Manifest());
+ RDD<Tuple2<Object, Tuple>> rdd2Pair = rdd2.map(new ExtractKeyFunction(
+ this, 1), SparkUtil.<Object, Tuple>getTuple2Manifest());
+
+ // join fn is present in JavaPairRDD class ..
+ JavaPairRDD<Object, Tuple> rdd1Pair_javaRDD = new JavaPairRDD<Object, Tuple>(
+ rdd1Pair, SparkUtil.getManifest(Object.class),
+ SparkUtil.getManifest(Tuple.class));
+ JavaPairRDD<Object, Tuple> rdd2Pair_javaRDD = new JavaPairRDD<Object, Tuple>(
+ rdd2Pair, SparkUtil.getManifest(Object.class),
+ SparkUtil.getManifest(Tuple.class));
+
+ // do the join
+ JavaPairRDD<Object, Tuple2<Tuple, Tuple>> result_KeyValue = rdd1Pair_javaRDD
+ .join(rdd2Pair_javaRDD);
+
+ // map to get RDD<Tuple> from RDD<Object, Tuple2<Tuple, Tuple>> by
+ // ignoring the key (of type Object) and appending the values (the
+ // Tuples)
+ JavaRDD<Tuple> result = result_KeyValue
+ .mapPartitions(new ToValueFunction());
+
+ // return type is RDD<Tuple>, so take it from JavaRDD<Tuple>
+ return result.rdd();
+ }
+
+ private void createJoinPlans(MultiMap<PhysicalOperator, PhysicalPlan> inpPlans) throws PlanException {
+
+ int i = -1;
+ for (PhysicalOperator inpPhyOp : inpPlans.keySet()) {
+ ++i;
+ POLocalRearrange lr = new POLocalRearrange(genKey());
+ try {
+ lr.setIndex(i);
+ } catch (ExecException e) {
+ throw new PlanException(e.getMessage(), e.getErrorCode(), e.getErrorSource(), e);
+ }
+ lr.setResultType(DataType.TUPLE);
+ lr.setKeyType(DataType.TUPLE);//keyTypes.get(i).size() > 1 ? DataType.TUPLE : keyTypes.get(i).get(0));
+ lr.setPlans(inpPlans.get(inpPhyOp));
+ LRs[i] = lr;
+ }
+ }
+
+ private OperatorKey genKey() {
+ return new OperatorKey(poSkewedJoin.getOperatorKey().scope, NodeIdGenerator.getGenerator().getNextNodeId(poSkewedJoin.getOperatorKey().scope));
+ }
+
+ private static class ExtractKeyFunction extends
+ AbstractFunction1<Tuple, Tuple2<Object, Tuple>> implements
+ Serializable {
+
+ private final SkewedJoinConverter poSkewedJoin;
+ private final int LR_index; // 0 for left table, 1 for right table
+
+ public ExtractKeyFunction(SkewedJoinConverter poSkewedJoin, int LR_index) {
+ this.poSkewedJoin = poSkewedJoin;
+ this.LR_index = LR_index;
+ }
+
+ @Override
+ public Tuple2<Object, Tuple> apply(Tuple tuple) {
+
+ // attach tuple to LocalRearrange
+ poSkewedJoin.LRs[LR_index].attachInput(tuple);
+
+ try {
+ // getNextTuple() returns the rearranged tuple
+ Result lrOut = poSkewedJoin.LRs[LR_index].getNextTuple();
+
+ // If tuple is (AA, 5) and key index is $1, then it lrOut is 0 5
+ // (AA), so get(1) returns key
+ Object key = ((Tuple) lrOut.result).get(1);
+
+ Tuple value = tuple;
+
+ // make a (key, value) pair
+ Tuple2<Object, Tuple> tuple_KeyValue = new Tuple2<Object, Tuple>(
+ key, value);
+
+ return tuple_KeyValue;
+
+ } catch (Exception e) {
+ System.out.print(e);
+ return null;
+ }
+ }
+
+ }
+
+ private static class ToValueFunction
+ extends
+ FlatMapFunction<Iterator<Tuple2<Object, Tuple2<Tuple, Tuple>>>, Tuple>
+ implements Serializable {
+
+ private class Tuple2TransformIterable implements Iterable<Tuple> {
+
+ Iterator<Tuple2<Object, Tuple2<Tuple, Tuple>>> in;
+
+ Tuple2TransformIterable(
+ Iterator<Tuple2<Object, Tuple2<Tuple, Tuple>>> input) {
+ in = input;
+ }
+
+ public Iterator<Tuple> iterator() {
+ return new IteratorTransform<Tuple2<Object, Tuple2<Tuple, Tuple>>, Tuple>(
+ in) {
+ @Override
+ protected Tuple transform(
+ Tuple2<Object, Tuple2<Tuple, Tuple>> next) {
+ try {
+
+ Tuple leftTuple = next._2._1;
+ Tuple rightTuple = next._2._2;
+
+ TupleFactory tf = TupleFactory.getInstance();
+ Tuple result = tf.newTuple(leftTuple.size()
+ + rightTuple.size());
+
+ // append the two tuples together to make a
+ // resulting tuple
+ for (int i = 0; i < leftTuple.size(); i++)
+ result.set(i, leftTuple.get(i));
+ for (int i = 0; i < rightTuple.size(); i++)
+ result.set(i + leftTuple.size(),
+ rightTuple.get(i));
+
+ System.out.println("MJC: Result = "
+ + result.toDelimitedString(" "));
+
+ return result;
+
+ } catch (Exception e) {
+ System.out.println(e);
+ }
+ return null;
+ }
+ };
+ }
+ }
+
+ @Override
+ public Iterable<Tuple> call(
+ Iterator<Tuple2<Object, Tuple2<Tuple, Tuple>>> input) {
+ return new Tuple2TransformIterable(input);
+ }
+ }
+}
\ No newline at end of file
Propchange: pig/branches/spark/src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java
------------------------------------------------------------------------------
svn:eol-style = native