You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pig.apache.org by Pallavi Rao <pa...@inmobi.com> on 2016/04/04 07:20:02 UTC
Review Request 45667: Support Pig On Spark
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
https://reviews.apache.org/r/45667/
-----------------------------------------------------------
Review request for pig, Daniel Dai and Rohini Palaniswamy.
Bugs: PIG-4059 and PIG-4854
https://issues.apache.org/jira/browse/PIG-4059
https://issues.apache.org/jira/browse/PIG-4854
Repository: pig-git
Description
-------
The patch contains all the work done in the spark branch, so far.
Diffs
-----
bin/pig 81f1426
build.xml 8db1a80
ivy.xml dd9878e
ivy/libraries.properties 55d9aed
shims/test/hadoop20/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
shims/test/hadoop23/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
src/META-INF/services/org.apache.pig.ExecType 5c034c8
src/docs/src/documentation/content/xdocs/start.xml 36f9952
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/PhysicalOperator.java 1ff1abd
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POUserFunc.java ecf780c
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/plans/PhysicalPlan.java 2376d03
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POCollectedGroup.java bcbfe2b
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POFRJoin.java 894cda7
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POForEach.java 21b75f1
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POGlobalRearrange.java 52cfb73
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POMergeJoin.java 6adfa91
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POSort.java c3a82c3
src/org/apache/pig/backend/hadoop/executionengine/spark/JobGraphBuilder.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/JobMetricsListener.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/KryoSerializer.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/MapReducePartitionerWrapper.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecType.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecutionEngine.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLauncher.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLocalExecType.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkUtil.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/UDFJarsFinder.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CollectedGroupConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CounterConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/DistinctConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FRJoinConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FilterConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ForEachConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/GlobalRearrangeConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IndexedKey.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IteratorTransform.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LimitConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LoadConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LocalRearrangeConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeCogroupConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeJoinConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/OutputConsumerIterator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PackageConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PigSecondaryKeyComparatorSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RDDConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RankConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ReduceByConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SortConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SplitConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StoreConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StreamConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/UnionConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/operator/NativeSparkOperator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POGlobalRearrangeSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POReduceBySpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/AccumulatorOptimizer.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/CombinerOptimizer.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/MultiQueryOptimizerSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/NoopFilterRemover.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/ParallelismSetter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/SecondaryKeyOptimizerSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/DotSparkPrinter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompiler.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompilerException.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOpPlanVisitor.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperPlan.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPOPackageAnnotator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPrinter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/running/PigInputFormatSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/util/AccumulatorOptimizerUtil.java c4b44ad
src/org/apache/pig/backend/hadoop/executionengine/util/CombinerOptimizerUtil.java 889c01b
src/org/apache/pig/backend/hadoop/executionengine/util/SecondaryKeyOptimizerUtil.java 0b59c9c
src/org/apache/pig/backend/hadoop/hbase/HBaseStorage.java e0581d9
src/org/apache/pig/data/SelfSpillBag.java d17f0a8
src/org/apache/pig/impl/PigContext.java d43949f
src/org/apache/pig/impl/plan/OperatorPlan.java 8b2e2e7
src/org/apache/pig/tools/pigstats/PigStatsUtil.java 542cc2e
src/org/apache/pig/tools/pigstats/spark/SparkCounter.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkCounterGroup.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkCounters.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkJobStats.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkPigStats.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkPigStatusReporter.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkScriptState.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkStatsUtil.java PRE-CREATION
test/e2e/pig/build.xml f7c38ba
test/e2e/pig/conf/spark.conf PRE-CREATION
test/e2e/pig/drivers/TestDriverPig.pm bf9c302
test/e2e/pig/tests/streaming.conf 18f2fb2
test/excluded-tests-spark PRE-CREATION
test/org/apache/pig/newplan/logical/relational/TestLocationInPhysicalPlan.java 94b34b3
test/org/apache/pig/spark/TestIndexedKey.java PRE-CREATION
test/org/apache/pig/spark/TestSecondarySortSpark.java PRE-CREATION
test/org/apache/pig/test/MiniGenericCluster.java 9347269
test/org/apache/pig/test/TestAssert.java 6d4b5c6
test/org/apache/pig/test/TestBuiltin.java 44b4d09
test/org/apache/pig/test/TestCase.java c9bb2fa
test/org/apache/pig/test/TestCollectedGroup.java a958d33
test/org/apache/pig/test/TestCombiner.java df44293
test/org/apache/pig/test/TestCubeOperator.java de96e6c
test/org/apache/pig/test/TestEvalPipeline.java 9efde13
test/org/apache/pig/test/TestEvalPipeline2.java c8f51d7
test/org/apache/pig/test/TestEvalPipelineLocal.java c12d595
test/org/apache/pig/test/TestFinish.java f18c103
test/org/apache/pig/test/TestForEachNestedPlanLocal.java b0aa3a8
test/org/apache/pig/test/TestGrunt.java ef121a3
test/org/apache/pig/test/TestHBaseStorage.java 8d2ad85
test/org/apache/pig/test/TestLimitVariable.java 53b9dae
test/org/apache/pig/test/TestMapSideCogroup.java 2c78b4a
test/org/apache/pig/test/TestMergeJoin.java f1a9608
test/org/apache/pig/test/TestMergeJoinOuter.java 81aee55
test/org/apache/pig/test/TestMultiQuery.java 40684b4
test/org/apache/pig/test/TestMultiQueryLocal.java b9ac035
test/org/apache/pig/test/TestNativeMapReduce.java c4f6573
test/org/apache/pig/test/TestNullConstant.java 3ea4509
test/org/apache/pig/test/TestPigRunner.java fde8609
test/org/apache/pig/test/TestPigServerLocal.java fbabd03
test/org/apache/pig/test/TestProjectRange.java 2e3e7b8
test/org/apache/pig/test/TestPruneColumn.java 3936332
test/org/apache/pig/test/TestRank1.java 9e4ef62
test/org/apache/pig/test/TestRank2.java fc802a9
test/org/apache/pig/test/TestRank3.java 43af10d
test/org/apache/pig/test/TestSecondarySort.java 8991010
test/org/apache/pig/test/TestSkewedJoin.java dba2241
test/org/apache/pig/test/TestStoreBase.java eb3b253
test/org/apache/pig/test/Util.java 8dae247
test/spark-tests PRE-CREATION
Diff: https://reviews.apache.org/r/45667/diff/
Testing
-------
New UTs were added where required and ensure old UTs pass -> https://builds.apache.org/job/Pig-spark/
Thanks,
Pallavi Rao
Re: Review Request 45667: Support Pig On Spark
Posted by kelly zhang <li...@intel.com>.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > src/org/apache/pig/backend/hadoop/hbase/HBaseStorage.java, line 313
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323847#file1323847line313>
> >
> > Just change the code to use UDFContext.getUDFContext().getJobConf() which should not be null instead of getClientSystemProps(). Not sure why it is using getClientSystemProps() in the first place.
>
> kelly zhang wrote:
> Here, if we change to UDFContext.getUDFContext().getJobConf(), the problem still exists.
>
>
> The reason we need to check whether UDFContext.getUDFContext().getJobConf() is null is that the Spark executor first initializes all the objects and only then calls UDFContext.deserialize(). The HBaseStorage constructor is therefore called before UDFContext.deserialize(), so we need to verify whether UDFContext.getUDFContext().getJobConf() is null here; otherwise an NPE will be thrown.
Updated PigOnSpark_3.patch. After PIG-4920, we store UDFContext#getClientSystemProps and UDFContext#getUdfConfs in SparkEngineConf, so HBaseStorage is no longer modified.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > src/org/apache/pig/impl/PigContext.java, line 924
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323849#file1323849line924>
> >
> > This can be reverted. PigContext need not be serialized to the backend. See PIG-4866
>
> kelly zhang wrote:
> PIG-4866 is about not serializing PigContext in the configuration, whereas here we override PigContext#writeObject and PigContext#readObject to serialize and deserialize only one attribute (packageImportList) in spark mode.
Updated PigOnSpark_3.patch. After PIG-4920, we store UDFContext#getClientSystemProps and UDFContext#getUdfConfs in SparkEngineConf, so PigContext is no longer modified.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestBuiltin.java, line 3255
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323870#file1323870line3255>
> >
> > This testcase is broken if you have 0-0 repeating twice. It is not UniqueID anymore.
>
> kelly zhang wrote:
> 0-0 repeating twice is because we use TaskID in UniqueID#exec:
> public String exec(Tuple input) throws IOException {
> String taskIndex = PigMapReduce.sJobConfInternal.get().get(PigConstants.TASK_INDEX);
> String sequenceId = taskIndex + "-" + Long.toString(sequence);
> sequence++;
> return sequenceId;
> }
> In MR, we initialize PigConstants.TASK_INDEX in org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigGenericMapReduce.Reduce#setup
> protected void setup(Context context) throws IOException, InterruptedException {
> ...
> context.getConfiguration().set(PigConstants.TASK_INDEX, Integer.toString(context.getTaskAttemptID().getTaskID().getId()));
> ...
> }
>
> But Spark does not provide a function like PigGenericMapReduce.Reduce#setup to initialize PigConstants.TASK_INDEX when the job starts.
> I suggest filing a new JIRA (Initialize PigConstants.TASK_INDEX when spark job starts) and skipping this unit test until that JIRA is resolved.
Updated PigOnSpark_3.patch. I have created PIG-5051 and added a comment on TestBuiltin#testUniqueID (the behavior in spark mode will be the same as in MR until PIG-5051 is fixed).
- kelly
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
https://reviews.apache.org/r/45667/#review134255
-----------------------------------------------------------
On July 11, 2016, 4:32 a.m., Pallavi Rao wrote:
>
> -----------------------------------------------------------
> This is an automatically generated e-mail. To reply, visit:
> https://reviews.apache.org/r/45667/
> -----------------------------------------------------------
>
> (Updated July 11, 2016, 4:32 a.m.)
>
>
> Review request for pig, Daniel Dai and Rohini Palaniswamy.
>
>
> Bugs: PIG-4059 and PIG-4854
> https://issues.apache.org/jira/browse/PIG-4059
> https://issues.apache.org/jira/browse/PIG-4854
>
>
> Repository: pig-git
>
>
> Description
> -------
>
> The patch contains all the work done in the spark branch, so far.
>
>
> Diffs
> -----
>
> bin/pig 81f1426
> build.xml 99ba1f4
> ivy.xml dd9878e
> ivy/libraries.properties 3a819a5
> shims/test/hadoop20/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
> shims/test/hadoop23/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
> shims/test/hadoop23/org/apache/pig/test/TezMiniCluster.java 792a1bd
> shims/test/hadoop23/org/apache/pig/test/YarnMiniCluster.java PRE-CREATION
> src/META-INF/services/org.apache.pig.ExecType 5c034c8
> src/docs/src/documentation/content/xdocs/start.xml 36f9952
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/PhysicalOperator.java 1ff1abd
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POUserFunc.java ecf780c
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/plans/PhysicalPlan.java 2376d03
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POCollectedGroup.java bcbfe2b
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POFRJoin.java d80951a
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POForEach.java 21b75f1
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POGlobalRearrange.java 52cfb73
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POMergeJoin.java 13f70c0
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POSort.java c3a82c3
> src/org/apache/pig/backend/hadoop/executionengine/spark/JobGraphBuilder.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/JobMetricsListener.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/KryoSerializer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/MapReducePartitionerWrapper.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecType.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecutionEngine.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLauncher.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLocalExecType.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkUtil.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/UDFJarsFinder.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CollectedGroupConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CounterConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/DistinctConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FRJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FilterConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ForEachConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/GlobalRearrangeConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IndexedKey.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IteratorTransform.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LimitConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LoadConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LocalRearrangeConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeCogroupConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/OutputConsumerIterator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PackageConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PigSecondaryKeyComparatorSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RDDConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RankConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ReduceByConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SortConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SplitConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StoreConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StreamConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/UnionConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/NativeSparkOperator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POGlobalRearrangeSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POReduceBySpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/AccumulatorOptimizer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/CombinerOptimizer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/MultiQueryOptimizerSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/NoopFilterRemover.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/ParallelismSetter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/SecondaryKeyOptimizerSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/DotSparkPrinter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompiler.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompilerException.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOpPlanVisitor.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperPlan.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPOPackageAnnotator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPrinter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/running/PigInputFormatSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/util/AccumulatorOptimizerUtil.java c4b44ad
> src/org/apache/pig/backend/hadoop/executionengine/util/CombinerOptimizerUtil.java 889c01b
> src/org/apache/pig/backend/hadoop/executionengine/util/SecondaryKeyOptimizerUtil.java 0b59c9c
> src/org/apache/pig/backend/hadoop/hbase/HBaseStorage.java e0581d9
> src/org/apache/pig/data/SelfSpillBag.java d17f0a8
> src/org/apache/pig/impl/PigContext.java d43949f
> src/org/apache/pig/impl/plan/OperatorPlan.java 8b2e2e7
> src/org/apache/pig/tools/pigstats/PigStatsUtil.java 542cc2e
> src/org/apache/pig/tools/pigstats/spark/SparkCounter.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkCounterGroup.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkCounters.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkJobStats.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkPigStats.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkPigStatusReporter.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkScriptState.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkStatsUtil.java PRE-CREATION
> test/e2e/pig/build.xml f7c38ba
> test/e2e/pig/conf/spark.conf PRE-CREATION
> test/e2e/pig/drivers/TestDriverPig.pm bf9c302
> test/e2e/pig/tests/streaming.conf 18f2fb2
> test/excluded-tests-spark PRE-CREATION
> test/org/apache/pig/newplan/logical/relational/TestLocationInPhysicalPlan.java 94b34b3
> test/org/apache/pig/spark/TestIndexedKey.java PRE-CREATION
> test/org/apache/pig/spark/TestSecondarySortSpark.java PRE-CREATION
> test/org/apache/pig/test/MiniGenericCluster.java 9347269
> test/org/apache/pig/test/TestAssert.java 6d4b5c6
> test/org/apache/pig/test/TestBuiltin.java fbc3f1e
> test/org/apache/pig/test/TestCase.java c9bb2fa
> test/org/apache/pig/test/TestCollectedGroup.java a958d33
> test/org/apache/pig/test/TestCombiner.java df44293
> test/org/apache/pig/test/TestCubeOperator.java de96e6c
> test/org/apache/pig/test/TestEvalPipeline.java 9efde13
> test/org/apache/pig/test/TestEvalPipeline2.java c8f51d7
> test/org/apache/pig/test/TestEvalPipelineLocal.java c12d595
> test/org/apache/pig/test/TestFinish.java f18c103
> test/org/apache/pig/test/TestForEachNestedPlanLocal.java b0aa3a8
> test/org/apache/pig/test/TestGrunt.java 9eaf298
> test/org/apache/pig/test/TestHBaseStorage.java 8d2ad85
> test/org/apache/pig/test/TestLimitVariable.java 53b9dae
> test/org/apache/pig/test/TestMapSideCogroup.java 2c78b4a
> test/org/apache/pig/test/TestMergeJoin.java f1a9608
> test/org/apache/pig/test/TestMergeJoinOuter.java 81aee55
> test/org/apache/pig/test/TestMultiQuery.java c32eab7
> test/org/apache/pig/test/TestMultiQueryLocal.java b9ac035
> test/org/apache/pig/test/TestNativeMapReduce.java c4f6573
> test/org/apache/pig/test/TestNullConstant.java 3ea4509
> test/org/apache/pig/test/TestPigRunner.java fde8609
> test/org/apache/pig/test/TestPigServerLocal.java fbabd03
> test/org/apache/pig/test/TestProjectRange.java 2e3e7b8
> test/org/apache/pig/test/TestPruneColumn.java 3936332
> test/org/apache/pig/test/TestRank1.java 9e4ef62
> test/org/apache/pig/test/TestRank2.java fc802a9
> test/org/apache/pig/test/TestRank3.java 43af10d
> test/org/apache/pig/test/TestSecondarySort.java 8991010
> test/org/apache/pig/test/TestSkewedJoin.java dba2241
> test/org/apache/pig/test/TestStoreBase.java eb3b253
> test/org/apache/pig/test/Util.java 36d01e8
> test/spark-tests PRE-CREATION
>
> Diff: https://reviews.apache.org/r/45667/diff/
>
>
> Testing
> -------
>
> New UTs were added where required and ensure old UTs pass -> https://builds.apache.org/job/Pig-spark/
>
>
> Thanks,
>
> Pallavi Rao
>
>
RE: Review Request 45667: Support Pig On Spark
Posted by "Zhang, Liyun" <li...@intel.com>.
Pallavi created this review board request, so I do not have the privilege to upload a new patch to it. I have sent the new patch to Pallavi, and Pallavi will upload it later.
-----Original Message-----
From: kelly zhang [mailto:noreply@reviews.apache.org] On Behalf Of kelly zhang
Sent: Tuesday, June 14, 2016 11:25 AM
To: Rohini Palaniswamy; Daniel Dai
Cc: Zhang, Liyun; Pallavi Rao; pig
Subject: Re: Review Request 45667: Support Pig On Spark
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestBuiltin.java, line 3255
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323870#file1323870
> > line3255>
> >
> > This testcase is broken if you have 0-0 repeating twice. It is not UniqueID anymore.
0-0 repeating twice is because we use TaskID in UniqueID#exec:
public String exec(Tuple input) throws IOException {
String taskIndex = PigMapReduce.sJobConfInternal.get().get(PigConstants.TASK_INDEX);
String sequenceId = taskIndex + "-" + Long.toString(sequence);
sequence++;
return sequenceId;
}
In MR, we initialize PigConstants.TASK_INDEX in org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigGenericMapReduce.Reduce#setup
protected void setup(Context context) throws IOException, InterruptedException {
...
context.getConfiguration().set(PigConstants.TASK_INDEX, Integer.toString(context.getTaskAttemptID().getTaskID().getId()));
...
}
But Spark does not provide a function like PigGenericMapReduce.Reduce#setup to initialize PigConstants.TASK_INDEX when the job starts.
I suggest filing a new JIRA (Initialize PigConstants.TASK_INDEX when spark job starts) and skipping this unit test until that JIRA is resolved.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > src/org/apache/pig/data/SelfSpillBag.java, line 32
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323848#file1323848
> > line32>
> >
> > Why is bag even being serialized by Spark?
SelfSpillBag is used in TestHBaseStorage; if it is not marked transient, a NotSerializableException is thrown.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/newplan/logical/relational/TestLocationInPhysica
> > lPlan.java, line 66
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323865#file1323865
> > line66>
> >
> > Why does A[3,4] repeat?
The pig script is like:
LOAD '" + Util.encodeEscape(input.getAbsolutePath()) + "' using PigStorage();\n"
"B = GROUP A BY $0;\n"
"A = FOREACH B GENERATE COUNT(A);\n"
"STORE A INTO '" + Util.encodeEscape(output.getAbsolutePath()) + "';");
The spark plan is :
A: Store(/tmp/pig_junit_tmp1755582848/test6087259092054964214output:org.apache.pig.builtin.PigStorage) - scope-9
|
|---A: New For Each(false)[tuple] - scope-13
| |
| Project[bag][1] - scope-11
|
| POUserFunc(org.apache.pig.builtin.COUNT$Final)[long] - scope-12
| |
| |---Project[bag][1] - scope-28
|
|---Reduce By(false,false)[tuple] - scope-18
| |
| Project[bytearray][0] - scope-19
| |
| POUserFunc(org.apache.pig.builtin.COUNT$Intermediate)[tuple] - scope-20
| |
| |---Project[bag][1] - scope-21
|
|---B: Local Rearrange[tuple]{bytearray}(false) - scope-24
| |
| Project[bytearray][0] - scope-26
|
|---A: New For Each(false,false)[bag] - scope-14
| |
| Project[bytearray][0] - scope-15
| |
| POUserFunc(org.apache.pig.builtin.COUNT$Initial)[tuple] - scope-16
| |
| |---Project[bag][1] - scope-17
|
|---Pre Combiner Local Rearrange[tuple]{Unknown} - scope-27
|
|---A: Load(/tmp/pig_junit_tmp1755582848/test7108242581632795697input:PigStorage) - scope-0--------
There are two ForEach (scope-13 and scope-14) in the sparkplan so A[3,4] appears twice.
Comparing with MR plan:
#--------------------------------------------------
# Map Reduce Plan
#--------------------------------------------------
MapReduce node scope-10
Map Plan
B: Local Rearrange[tuple]{bytearray}(false) - scope-22
| |
| Project[bytearray][0] - scope-24
|
|---A: New For Each(false,false)[bag] - scope-11
| |
| Project[bytearray][0] - scope-12
| |
| POUserFunc(org.apache.pig.builtin.COUNT$Initial)[tuple] - scope-13
| |
| |---Project[bag][1] - scope-14
|
|---Pre Combiner Local Rearrange[tuple]{Unknown} - scope-25
|
|---A: Load(/tmp/pig_junit_tmp910232853/test2548400580131197161input:PigStorage) - scope-0-------- Combine Plan
B: Local Rearrange[tuple]{bytearray}(false) - scope-26
| |
| Project[bytearray][0] - scope-28
|
|---A: New For Each(false,false)[bag] - scope-15
| |
| Project[bytearray][0] - scope-16
| |
| POUserFunc(org.apache.pig.builtin.COUNT$Intermediate)[tuple] - scope-17
| |
| |---Project[bag][1] - scope-18
|
|---B: Package(CombinerPackager)[tuple]{bytearray} - scope-21-------- Reduce Plan
A: Store(/tmp/pig_junit_tmp910232853/test9096852332434708302output:org.apache.pig.builtin.PigStorage) - scope-9
|
|---A: New For Each(false)[bag] - scope-8
| |
| POUserFunc(org.apache.pig.builtin.COUNT$Final)[long] - scope-6
| |
| |---Project[bag][1] - scope-19
|
|---B: Package(CombinerPackager)[tuple]{bytearray} - scope-2-------- Global sort: false
----------------
There are two ForEach(scope-8 and scope-11) in MapPlan and Reduce Plan, so A[3,4] appears twice in result(M: A[1,4],A[3,4],B[2,4] C: A[3,4],B[2,4] R: A[3,4] )
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestMultiQuery.java, line 116
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323886#file1323886
> > line116>
> >
> > Why are we using checkQueryOutputsAfterSortRecursive in many places when checkQueryOutputsAfterSort would do? It will unnecessarily increase the test execution time. Can they all be changed? Or am I missing something and checkQueryOutputsAfterSort cannot be used for some reason?
The difference between Util.checkQueryOutputsAfterSortRecursive and Util.checkQueryOutputsAfterSort is that we can pass a schema (LogicalSchema) to Util.checkQueryOutputsAfterSortRecursive, so the function converts expectedResArray (String[]) into expectedRes (ArrayList<Tuple>) with the proper schema (int, string or other types):
static public void checkQueryOutputsAfterSortRecursive(Iterator<Tuple> actualResultsIt,
String[] expectedResArray, LogicalSchema schema) throws IOException {
...
}
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > ivy.xml, line 451
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323775#file1323775line451>
> >
> > What does 2.10 in spark-core_2.10 and spark-yarn_2.10 signify?
2.10 is the Scala version; it is hard-coded in the artifact name when we write the ivy dependency (http://mvnrepository.com/artifact/org.apache.spark).
- kelly
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
https://reviews.apache.org/r/45667/#review134255
-----------------------------------------------------------
On April 4, 2016, 5:19 a.m., Pallavi Rao wrote:
>
> -----------------------------------------------------------
> This is an automatically generated e-mail. To reply, visit:
> https://reviews.apache.org/r/45667/
> -----------------------------------------------------------
>
> (Updated April 4, 2016, 5:19 a.m.)
>
>
> Review request for pig, Daniel Dai and Rohini Palaniswamy.
>
>
> Bugs: PIG-4059 and PIG-4854
> https://issues.apache.org/jira/browse/PIG-4059
> https://issues.apache.org/jira/browse/PIG-4854
>
>
> Repository: pig-git
>
>
> Description
> -------
>
> The patch contains all the work done in the spark branch, so far.
>
>
> Diffs
> -----
>
> bin/pig 81f1426
> build.xml 8db1a80
> ivy.xml dd9878e
> ivy/libraries.properties 55d9aed
> shims/test/hadoop20/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
> shims/test/hadoop23/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
> src/META-INF/services/org.apache.pig.ExecType 5c034c8
> src/docs/src/documentation/content/xdocs/start.xml 36f9952
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/PhysicalOperator.java 1ff1abd
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POUserFunc.java ecf780c
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/plans/PhysicalPlan.java 2376d03
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POCollectedGroup.java bcbfe2b
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POFRJoin.java 894cda7
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POForEach.java 21b75f1
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POGlobalRearrange.java 52cfb73
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POMergeJoin.java 6adfa91
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POSort.java c3a82c3
> src/org/apache/pig/backend/hadoop/executionengine/spark/JobGraphBuilder.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/JobMetricsListener.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/KryoSerializer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/MapReducePartitionerWrapper.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecType.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecutionEngine.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLauncher.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLocalExecType.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkUtil.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/UDFJarsFinder.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CollectedGroupConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CounterConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/DistinctConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FRJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FilterConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ForEachConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/GlobalRearrangeConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IndexedKey.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IteratorTransform.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LimitConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LoadConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LocalRearrangeConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeCogroupConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/OutputConsumerIterator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PackageConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PigSecondaryKeyComparatorSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RDDConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RankConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ReduceByConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SortConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SplitConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StoreConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StreamConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/UnionConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/NativeSparkOperator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POGlobalRearrangeSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POReduceBySpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/AccumulatorOptimizer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/CombinerOptimizer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/MultiQueryOptimizerSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/NoopFilterRemover.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/ParallelismSetter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/SecondaryKeyOptimizerSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/DotSparkPrinter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompiler.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompilerException.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOpPlanVisitor.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperPlan.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPOPackageAnnotator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPrinter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/running/PigInputFormatSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/util/AccumulatorOptimizerUtil.java c4b44ad
> src/org/apache/pig/backend/hadoop/executionengine/util/CombinerOptimizerUtil.java 889c01b
> src/org/apache/pig/backend/hadoop/executionengine/util/SecondaryKeyOptimizerUtil.java 0b59c9c
> src/org/apache/pig/backend/hadoop/hbase/HBaseStorage.java e0581d9
> src/org/apache/pig/data/SelfSpillBag.java d17f0a8
> src/org/apache/pig/impl/PigContext.java d43949f
> src/org/apache/pig/impl/plan/OperatorPlan.java 8b2e2e7
> src/org/apache/pig/tools/pigstats/PigStatsUtil.java 542cc2e
> src/org/apache/pig/tools/pigstats/spark/SparkCounter.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkCounterGroup.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkCounters.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkJobStats.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkPigStats.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkPigStatusReporter.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkScriptState.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkStatsUtil.java PRE-CREATION
> test/e2e/pig/build.xml f7c38ba
> test/e2e/pig/conf/spark.conf PRE-CREATION
> test/e2e/pig/drivers/TestDriverPig.pm bf9c302
> test/e2e/pig/tests/streaming.conf 18f2fb2
> test/excluded-tests-spark PRE-CREATION
> test/org/apache/pig/newplan/logical/relational/TestLocationInPhysicalPlan.java 94b34b3
> test/org/apache/pig/spark/TestIndexedKey.java PRE-CREATION
> test/org/apache/pig/spark/TestSecondarySortSpark.java PRE-CREATION
> test/org/apache/pig/test/MiniGenericCluster.java 9347269
> test/org/apache/pig/test/TestAssert.java 6d4b5c6
> test/org/apache/pig/test/TestBuiltin.java 44b4d09
> test/org/apache/pig/test/TestCase.java c9bb2fa
> test/org/apache/pig/test/TestCollectedGroup.java a958d33
> test/org/apache/pig/test/TestCombiner.java df44293
> test/org/apache/pig/test/TestCubeOperator.java de96e6c
> test/org/apache/pig/test/TestEvalPipeline.java 9efde13
> test/org/apache/pig/test/TestEvalPipeline2.java c8f51d7
> test/org/apache/pig/test/TestEvalPipelineLocal.java c12d595
> test/org/apache/pig/test/TestFinish.java f18c103
> test/org/apache/pig/test/TestForEachNestedPlanLocal.java b0aa3a8
> test/org/apache/pig/test/TestGrunt.java ef121a3
> test/org/apache/pig/test/TestHBaseStorage.java 8d2ad85
> test/org/apache/pig/test/TestLimitVariable.java 53b9dae
> test/org/apache/pig/test/TestMapSideCogroup.java 2c78b4a
> test/org/apache/pig/test/TestMergeJoin.java f1a9608
> test/org/apache/pig/test/TestMergeJoinOuter.java 81aee55
> test/org/apache/pig/test/TestMultiQuery.java 40684b4
> test/org/apache/pig/test/TestMultiQueryLocal.java b9ac035
> test/org/apache/pig/test/TestNativeMapReduce.java c4f6573
> test/org/apache/pig/test/TestNullConstant.java 3ea4509
> test/org/apache/pig/test/TestPigRunner.java fde8609
> test/org/apache/pig/test/TestPigServerLocal.java fbabd03
> test/org/apache/pig/test/TestProjectRange.java 2e3e7b8
> test/org/apache/pig/test/TestPruneColumn.java 3936332
> test/org/apache/pig/test/TestRank1.java 9e4ef62
> test/org/apache/pig/test/TestRank2.java fc802a9
> test/org/apache/pig/test/TestRank3.java 43af10d
> test/org/apache/pig/test/TestSecondarySort.java 8991010
> test/org/apache/pig/test/TestSkewedJoin.java dba2241
> test/org/apache/pig/test/TestStoreBase.java eb3b253
> test/org/apache/pig/test/Util.java 8dae247
> test/spark-tests PRE-CREATION
>
> Diff: https://reviews.apache.org/r/45667/diff/
>
>
> Testing
> -------
>
> New UTs were added where required and ensure old UTs pass ->
> https://builds.apache.org/job/Pig-spark/
>
>
> Thanks,
>
> Pallavi Rao
>
>
Re: Review Request 45667: Support Pig On Spark
Posted by kelly zhang <li...@intel.com>.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestBuiltin.java, line 3255
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323870#file1323870line3255>
> >
> > This testcase is broken if you have 0-0 repeating twice. It is not UniqueID anymore.
0-0 repeating twice is because we use TaskID in UniqueID#exec:
public String exec(Tuple input) throws IOException {
String taskIndex = PigMapReduce.sJobConfInternal.get().get(PigConstants.TASK_INDEX);
String sequenceId = taskIndex + "-" + Long.toString(sequence);
sequence++;
return sequenceId;
}
In MR, we initialize PigConstants.TASK_INDEX in org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigGenericMapReduce.Reduce#setup:
protected void setup(Context context) throws IOException, InterruptedException {
...
context.getConfiguration().set(PigConstants.TASK_INDEX, Integer.toString(context.getTaskAttemptID().getTaskID().getId()));
...
}
But spark does not provide a function like PigGenericMapReduce.Reduce#setup to initialize PigConstants.TASK_INDEX when the job starts.
I suggest filing a new jira (Initialize PigConstants.TASK_INDEX when spark job starts) and skipping this unit test until that jira is resolved.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > src/org/apache/pig/data/SelfSpillBag.java, line 32
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323848#file1323848line32>
> >
> > Why is bag even being serialized by Spark?
SelfSpillBag is used in TestHBaseStorage; if we do not mark it transient, a NotSerializableException is thrown.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/newplan/logical/relational/TestLocationInPhysicalPlan.java, line 66
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323865#file1323865line66>
> >
> > Why does A[3,4] repeat?
The pig script is like:
LOAD '" + Util.encodeEscape(input.getAbsolutePath()) + "' using PigStorage();\n"
"B = GROUP A BY $0;\n"
"A = FOREACH B GENERATE COUNT(A);\n"
"STORE A INTO '" + Util.encodeEscape(output.getAbsolutePath()) + "';");
The spark plan is :
A: Store(/tmp/pig_junit_tmp1755582848/test6087259092054964214output:org.apache.pig.builtin.PigStorage) - scope-9
|
|---A: New For Each(false)[tuple] - scope-13
| |
| Project[bag][1] - scope-11
|
| POUserFunc(org.apache.pig.builtin.COUNT$Final)[long] - scope-12
| |
| |---Project[bag][1] - scope-28
|
|---Reduce By(false,false)[tuple] - scope-18
| |
| Project[bytearray][0] - scope-19
| |
| POUserFunc(org.apache.pig.builtin.COUNT$Intermediate)[tuple] - scope-20
| |
| |---Project[bag][1] - scope-21
|
|---B: Local Rearrange[tuple]{bytearray}(false) - scope-24
| |
| Project[bytearray][0] - scope-26
|
|---A: New For Each(false,false)[bag] - scope-14
| |
| Project[bytearray][0] - scope-15
| |
| POUserFunc(org.apache.pig.builtin.COUNT$Initial)[tuple] - scope-16
| |
| |---Project[bag][1] - scope-17
|
|---Pre Combiner Local Rearrange[tuple]{Unknown} - scope-27
|
|---A: Load(/tmp/pig_junit_tmp1755582848/test7108242581632795697input:PigStorage) - scope-0--------
There are two ForEach (scope-13 and scope-14) in the sparkplan so A[3,4] appears twice.
Comparing with MR plan:
#--------------------------------------------------
# Map Reduce Plan
#--------------------------------------------------
MapReduce node scope-10
Map Plan
B: Local Rearrange[tuple]{bytearray}(false) - scope-22
| |
| Project[bytearray][0] - scope-24
|
|---A: New For Each(false,false)[bag] - scope-11
| |
| Project[bytearray][0] - scope-12
| |
| POUserFunc(org.apache.pig.builtin.COUNT$Initial)[tuple] - scope-13
| |
| |---Project[bag][1] - scope-14
|
|---Pre Combiner Local Rearrange[tuple]{Unknown} - scope-25
|
|---A: Load(/tmp/pig_junit_tmp910232853/test2548400580131197161input:PigStorage) - scope-0--------
Combine Plan
B: Local Rearrange[tuple]{bytearray}(false) - scope-26
| |
| Project[bytearray][0] - scope-28
|
|---A: New For Each(false,false)[bag] - scope-15
| |
| Project[bytearray][0] - scope-16
| |
| POUserFunc(org.apache.pig.builtin.COUNT$Intermediate)[tuple] - scope-17
| |
| |---Project[bag][1] - scope-18
|
|---B: Package(CombinerPackager)[tuple]{bytearray} - scope-21--------
Reduce Plan
A: Store(/tmp/pig_junit_tmp910232853/test9096852332434708302output:org.apache.pig.builtin.PigStorage) - scope-9
|
|---A: New For Each(false)[bag] - scope-8
| |
| POUserFunc(org.apache.pig.builtin.COUNT$Final)[long] - scope-6
| |
| |---Project[bag][1] - scope-19
|
|---B: Package(CombinerPackager)[tuple]{bytearray} - scope-2--------
Global sort: false
----------------
There are two ForEach operators (scope-8 in the Reduce Plan and scope-11 in the Map Plan), so A[3,4] appears twice in the result (M: A[1,4],A[3,4],B[2,4] C: A[3,4],B[2,4] R: A[3,4]).
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestMultiQuery.java, line 116
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323886#file1323886line116>
> >
> > Why are we using checkQueryOutputsAfterSortRecursive in many places when checkQueryOutputsAfterSort would do? It will unnecessarily increase the test execution time. Can they all be changed? Or am I missing something and checkQueryOutputsAfterSort cannot be used for some reason?
The difference between Util.checkQueryOutputsAfterSortRecursive and Util.checkQueryOutputsAfterSort is that we can pass schema:LogicalSchema to Util.checkQueryOutputsAfterSortRecursive, so the function can convert expectedResArray:String[] to expectedRes:ArrayList<Tuple> with the proper schema (int, string or other types):
static public void checkQueryOutputsAfterSortRecursive(Iterator<Tuple> actualResultsIt,
String[] expectedResArray, LogicalSchema schema) throws IOException {
...
}
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > ivy.xml, line 451
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323775#file1323775line451>
> >
> > What does 2.10 in spark-core_2.10 and spark-yarn_2.10 signify?
2.10 is the Scala version; it is hard-coded in the artifact name when we write the ivy dependency (http://mvnrepository.com/artifact/org.apache.spark).
- kelly
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
https://reviews.apache.org/r/45667/#review134255
-----------------------------------------------------------
On April 4, 2016, 5:19 a.m., Pallavi Rao wrote:
>
> -----------------------------------------------------------
> This is an automatically generated e-mail. To reply, visit:
> https://reviews.apache.org/r/45667/
> -----------------------------------------------------------
>
> (Updated April 4, 2016, 5:19 a.m.)
>
>
> Review request for pig, Daniel Dai and Rohini Palaniswamy.
>
>
> Bugs: PIG-4059 and PIG-4854
> https://issues.apache.org/jira/browse/PIG-4059
> https://issues.apache.org/jira/browse/PIG-4854
>
>
> Repository: pig-git
>
>
> Description
> -------
>
> The patch contains all the work done in the spark branch, so far.
>
>
> Diffs
> -----
>
> bin/pig 81f1426
> build.xml 8db1a80
> ivy.xml dd9878e
> ivy/libraries.properties 55d9aed
> shims/test/hadoop20/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
> shims/test/hadoop23/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
> src/META-INF/services/org.apache.pig.ExecType 5c034c8
> src/docs/src/documentation/content/xdocs/start.xml 36f9952
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/PhysicalOperator.java 1ff1abd
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POUserFunc.java ecf780c
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/plans/PhysicalPlan.java 2376d03
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POCollectedGroup.java bcbfe2b
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POFRJoin.java 894cda7
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POForEach.java 21b75f1
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POGlobalRearrange.java 52cfb73
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POMergeJoin.java 6adfa91
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POSort.java c3a82c3
> src/org/apache/pig/backend/hadoop/executionengine/spark/JobGraphBuilder.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/JobMetricsListener.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/KryoSerializer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/MapReducePartitionerWrapper.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecType.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecutionEngine.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLauncher.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLocalExecType.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkUtil.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/UDFJarsFinder.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CollectedGroupConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CounterConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/DistinctConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FRJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FilterConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ForEachConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/GlobalRearrangeConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IndexedKey.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IteratorTransform.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LimitConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LoadConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LocalRearrangeConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeCogroupConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/OutputConsumerIterator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PackageConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PigSecondaryKeyComparatorSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RDDConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RankConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ReduceByConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SortConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SplitConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StoreConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StreamConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/UnionConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/NativeSparkOperator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POGlobalRearrangeSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POReduceBySpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/AccumulatorOptimizer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/CombinerOptimizer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/MultiQueryOptimizerSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/NoopFilterRemover.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/ParallelismSetter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/SecondaryKeyOptimizerSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/DotSparkPrinter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompiler.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompilerException.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOpPlanVisitor.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperPlan.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPOPackageAnnotator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPrinter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/running/PigInputFormatSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/util/AccumulatorOptimizerUtil.java c4b44ad
> src/org/apache/pig/backend/hadoop/executionengine/util/CombinerOptimizerUtil.java 889c01b
> src/org/apache/pig/backend/hadoop/executionengine/util/SecondaryKeyOptimizerUtil.java 0b59c9c
> src/org/apache/pig/backend/hadoop/hbase/HBaseStorage.java e0581d9
> src/org/apache/pig/data/SelfSpillBag.java d17f0a8
> src/org/apache/pig/impl/PigContext.java d43949f
> src/org/apache/pig/impl/plan/OperatorPlan.java 8b2e2e7
> src/org/apache/pig/tools/pigstats/PigStatsUtil.java 542cc2e
> src/org/apache/pig/tools/pigstats/spark/SparkCounter.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkCounterGroup.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkCounters.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkJobStats.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkPigStats.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkPigStatusReporter.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkScriptState.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkStatsUtil.java PRE-CREATION
> test/e2e/pig/build.xml f7c38ba
> test/e2e/pig/conf/spark.conf PRE-CREATION
> test/e2e/pig/drivers/TestDriverPig.pm bf9c302
> test/e2e/pig/tests/streaming.conf 18f2fb2
> test/excluded-tests-spark PRE-CREATION
> test/org/apache/pig/newplan/logical/relational/TestLocationInPhysicalPlan.java 94b34b3
> test/org/apache/pig/spark/TestIndexedKey.java PRE-CREATION
> test/org/apache/pig/spark/TestSecondarySortSpark.java PRE-CREATION
> test/org/apache/pig/test/MiniGenericCluster.java 9347269
> test/org/apache/pig/test/TestAssert.java 6d4b5c6
> test/org/apache/pig/test/TestBuiltin.java 44b4d09
> test/org/apache/pig/test/TestCase.java c9bb2fa
> test/org/apache/pig/test/TestCollectedGroup.java a958d33
> test/org/apache/pig/test/TestCombiner.java df44293
> test/org/apache/pig/test/TestCubeOperator.java de96e6c
> test/org/apache/pig/test/TestEvalPipeline.java 9efde13
> test/org/apache/pig/test/TestEvalPipeline2.java c8f51d7
> test/org/apache/pig/test/TestEvalPipelineLocal.java c12d595
> test/org/apache/pig/test/TestFinish.java f18c103
> test/org/apache/pig/test/TestForEachNestedPlanLocal.java b0aa3a8
> test/org/apache/pig/test/TestGrunt.java ef121a3
> test/org/apache/pig/test/TestHBaseStorage.java 8d2ad85
> test/org/apache/pig/test/TestLimitVariable.java 53b9dae
> test/org/apache/pig/test/TestMapSideCogroup.java 2c78b4a
> test/org/apache/pig/test/TestMergeJoin.java f1a9608
> test/org/apache/pig/test/TestMergeJoinOuter.java 81aee55
> test/org/apache/pig/test/TestMultiQuery.java 40684b4
> test/org/apache/pig/test/TestMultiQueryLocal.java b9ac035
> test/org/apache/pig/test/TestNativeMapReduce.java c4f6573
> test/org/apache/pig/test/TestNullConstant.java 3ea4509
> test/org/apache/pig/test/TestPigRunner.java fde8609
> test/org/apache/pig/test/TestPigServerLocal.java fbabd03
> test/org/apache/pig/test/TestProjectRange.java 2e3e7b8
> test/org/apache/pig/test/TestPruneColumn.java 3936332
> test/org/apache/pig/test/TestRank1.java 9e4ef62
> test/org/apache/pig/test/TestRank2.java fc802a9
> test/org/apache/pig/test/TestRank3.java 43af10d
> test/org/apache/pig/test/TestSecondarySort.java 8991010
> test/org/apache/pig/test/TestSkewedJoin.java dba2241
> test/org/apache/pig/test/TestStoreBase.java eb3b253
> test/org/apache/pig/test/Util.java 8dae247
> test/spark-tests PRE-CREATION
>
> Diff: https://reviews.apache.org/r/45667/diff/
>
>
> Testing
> -------
>
> New UTs were added where required and ensure old UTs pass -> https://builds.apache.org/job/Pig-spark/
>
>
> Thanks,
>
> Pallavi Rao
>
>
Re: Review Request 45667: Support Pig On Spark
Posted by kelly zhang <li...@intel.com>.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestCombiner.java, lines 189-191
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323873#file1323873line189>
> >
> > Can be removed with the change to checkCombinerUsed
We cannot change checkCombinerUsed(pigServer, "c", true) back to assertTrue(baos.toString().matches("(?si).*combine plan.*")). If we use the original code, this test will fail in spark mode because there is no combine plan in spark mode; instead, POReduceBySpark implements the combine function.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestCombiner.java, lines 239-241
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323873#file1323873line239>
> >
> > Can be removed with the change to checkCombinerUsed
We cannot change checkCombinerUsed(pigServer, "c", true) back to assertTrue(baos.toString().matches("(?si).*combine plan.*")). If we use the original code, this test will fail in spark mode because there is no combine plan in spark mode; instead, POReduceBySpark implements the combine function.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > src/org/apache/pig/backend/hadoop/hbase/HBaseStorage.java, line 313
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323847#file1323847line313>
> >
> > Just change the code to use UDFContext.getUDFContext().getJobConf() which should not be null instead of getClientSystemProps(). Not sure why it is using getClientSystemProps() in the first place.
Here if we change to UDFContext.getUDFContext().getJobConf(), problem still exists.
The reason we check whether UDFContext.getUDFContext().getJobConf() is null is that the spark executor first initializes all the objects and only then calls UDFContext.deserialize, so the HBaseStorage constructor is called before UDFContext.deserialize(). We therefore need to verify whether UDFContext.getUDFContext().getJobConf() is null here; otherwise an NPE will be thrown.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > src/org/apache/pig/backend/hadoop/executionengine/util/AccumulatorOptimizerUtil.java, line 294
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323844#file1323844line294>
> >
> > The utility class is for common code between mapreduce, tez and spark. Please move all Spark specific code to the spark AccumulatorOptimizer
OK, I will move AccumulatorOptimizerUtil#addAccumulatorSpark to the spark AccumulatorOptimizer, but I need to make AccumulatorOptimizerUtil#check public because it will be used in the spark AccumulatorOptimizer.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > src/org/apache/pig/impl/PigContext.java, line 924
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323849#file1323849line924>
> >
> > This can be reverted. PigContext need not be serialized to the backend. See PIG-4866
PIG-4866 is about not serializing PigContext in the configuration, whereas here we override PigContext#writeObject and PigContext#readObject to serialize and deserialize only one attribute (packageImportList) in spark mode.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestPigRunner.java, lines 333-334
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323890#file1323890line333>
> >
> > 1 job for 1 POStore can impact performance. It is not exactly multi-query. Is there a jira to optimize this better and use 1 job?
Yes, 1 job for 1 POStore can impact performance. PIG-4863 will implement it. I have added a TODO in the comment of code:
//TODO: 1 job for 1 POStore can impact performance. PIG-4863 will fix this problem
assertTrue(stats.getJobGraph().size() == 2);
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestStoreBase.java, line 155
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323899#file1323899line155>
> >
> > Second store should not succeed.
Yes, the second store should not succeed because the first one fails.
I will change the comment from
A = load xx;
store A into '1.out' using DummyStore('true','1'); -- first job should fail
store A into '2.out' using DummyStore('false','1'); -- second job should success
After multiQuery optimization the spark plan will be:
Split - scope-14
| |
| a: Store(hdfs://1.out:myudfs.DummyStore('true','1')) - scope-4
| |
| a: Store(hdfs://2.out:myudfs.DummyStore('false','1')) - scope-7
|
|---a: Load(hdfs://zly2.sh.intel.com:8020/user/root/multiStore.txt:org.apache.pig.builtin.PigStorage) - scope-0------
In current code base, once the first job fails, the second job will not be executed.
the FILE_SETUPJOB_CALLED of second job will not exist.
I explain more detail in PIG-4243
to
A = load xx;
store A into '1.out' using DummyStore('true','1'); -- first store fails
store A into '2.out' using DummyStore('false','1'); -- second store will not be executed after first store fails
After multiQuery optimization the spark plan will be:
Split - scope-14
| |
| a: Store(hdfs://1.out:myudfs.DummyStore('true','1')) - scope-4
| |
| a: Store(hdfs://2.out:myudfs.DummyStore('false','1')) - scope-7
|
|---a: Load(hdfs://zly2.sh.intel.com:8020/user/root/multiStore.txt:org.apache.pig.builtin.PigStorage) - scope-0------
In current code base, once the first job fails, the second job will not be executed.
the FILE_SETUPJOB_CALLED of second job will not exist.
I explain more detail in PIG-4243
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestStoreBase.java, line 165
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323899#file1323899line165>
> >
> > Setup job file of second one should also be there. Same with setup task as well. They are supposed to happen before putNext of both the stores is called.
In mr mode, one MapReduceOper is related with one job. If the first store in the job fails, second store will not be executed. But
FILE_SETUPJOB_CALLED and FILE_SETUPTASK_CALLED of the first and second job both exist because setupTask() and setupJob() happen before putNext() of the first store.
In the current implementation of spark mode, this SparkOperator is related to two stores. One store is related to one job, so there are two jobs. Once one job fails, the other jobs will not be executed.
the FILE_SETUPJOB_CALLED, FILE_SETUPTASK_CALLED of second job will not exist because setupJob and setupTask of second job is after putNext of first store.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestMapSideCogroup.java, line 340
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323883#file1323883line340>
> >
> > Why is output ordering different with merge join? The assumption with merge join is that files are sorted and join is done based on that. So ordering should be same.
The script of this unit test is like following:
REGISTER myudfs.jar;
A = load './multiSplits1.txt,./multiSplits3.txt' using myudfs.DummyCollectableLoader() as (c1:chararray,c2:int);
B = load './multiSplits2.txt' using myudfs.DummyIndexableLoader() as (c1:chararray,c2:int);
C = cogroup A by c1, B by c1 using 'merge';
store C into './multiSplits.out';
cat bin/multiSplits1.txt
1 1
1 2
1 3
2 1
2 2
2 3
3 1
3 2
3 3
cat bin/multiSplits3.txt
4 1
4 2
4 3
5 1
5 2
5 3
6 1
6 2
6 3
7 1
7 2
7 3
8 1
8 2
8 3
9 1
9 2
9 3
cat bin/multiSplits2.txt
3 1
3 2
3 3
4 1
4 2
4 3
5 1
5 2
5 3
Interestingly, "load './multiSplits1.txt,./multiSplits3.txt'" behaves differently in spark and mr mode.
In mr, it loads multiple files according to the size of the splits (bigger splits go first), so it first loads multiSplits3.txt and then multiSplits1.txt. So the result of the script in mr will be:
4,{(4,1),(4,2),(4,3)},{(4,1),(4,2),(4,3)}
5,{(5,2),(5,1),(5,3)},{(5,1),(5,2),(5,3)}
6,{(6,1),(6,2),(6,3)},{}
7,{(7,1),(7,2),(7,3)},{}
8,{(8,1),(8,2),(8,3)},{}
9,{(9,1),(9,2),(9,3)},{}
1,{(1,1),(1,2),(1,3)},{}
2,{(2,1),(2,2),(2,3)},{}
3,{(3,3),(3,2),(3,1)},{(3,1),(3,2),(3,3)}
In spark, it loads multiple files in sequence. So the result of the script in spark will be:
1,{(1,1),(1,2),(1,3)},{}
2,{(2,1),(2,2),(2,3)},{}
3,{(3,3),(3,2),(3,1)},{(3,1),(3,2),(3,3)}
4,{(4,1),(4,2),(4,3)},{(4,1),(4,2),(4,3)}
5,{(5,2),(5,1),(5,3)},{(5,1),(5,2),(5,3)}
6,{(6,1),(6,2),(6,3)},{}
7,{(7,1),(7,2),(7,3)},{}
8,{(8,1),(8,2),(8,3)},{}
9,{(9,1),(9,2),(9,3)},{}
The reason for the difference when loading multiple splits in mr and spark mode is that org.apache.hadoop.mapreduce.JobSubmitter#writeNewSplits sorts the splits by size (bigger split first) in mr mode.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > build.xml, line 366
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323774#file1323774line366>
> >
> > Can you confirm that with all the spark changes running with hadoop 1.x (ant test -Dhadoopversion=20) is good?
Pig will drop hadoop 1 support since 0.17(PIG-4923), so should we support hadoop 1.x in pig on spark?
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > src/org/apache/pig/backend/hadoop/executionengine/util/SecondaryKeyOptimizerUtil.java, line 59
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323846#file1323846line59>
> >
> > It is not a good idea to use a static variable to keep state in this static class. Multi-threaded use of Pig will have issues if different modes are used.
> >
> > This utility class is for code common to mapreduce, tez and spark. Please move the spark specific code to spark's SecondaryKeyOptimizer. This is also necessary if we ever complete the mavenization and move tez and spark specific code to separate modules.
OK, I will move the spark specific code to spark's SecondaryKeyOptimizer, but I need to make the following functions static so that they can be reused in spark's SecondaryKeyOptimizer:
org.apache.pig.backend.hadoop.executionengine.util.SecondaryKeyOptimizerUtil#getSortKeyInfo
org.apache.pig.backend.hadoop.executionengine.util.SecondaryKeyOptimizerUtil#setSecondaryPlan
org.apache.pig.backend.hadoop.executionengine.util.SecondaryKeyOptimizerUtil#collectColumnChain
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestEvalPipelineLocal.java, lines 1135-1136
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323877#file1323877line1135>
> >
> > Code should be taking care of this.
Have moved the code that resets the job conf (UDFContext.getUDFContext().addJobConf(null)) to SparkLauncher.java
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestMultiQuery.java, line 116
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323886#file1323886line116>
> >
> > Why are we using checkQueryOutputsAfterSortRecursive in many places when checkQueryOutputsAfterSort would do? It will unnecessarily increase the test execution time. Can they all be changed? Or am I missing something and checkQueryOutputsAfterSort cannot be used for some reason?
>
> kelly zhang wrote:
> The difference between Util.checkQueryOutputsAfterSortRecursive and Util.checkQueryOutputsAfterSort: we can send schema:LogicalSchema to Util.checkQueryOutputsAfterSortRecursive, so function will help change expectedResArray:String[] to expectedRes:ArrayList<Tuple> with proper schema(int,string or other type)
>
> static public void checkQueryOutputsAfterSortRecursive(Iterator<Tuple> actualResultsIt,
> String[] expectedResArray, LogicalSchema schema) throws IOException {
> ...
> }
The output of group, join and distinct differs between mr and spark mode: the output is ordered in mr because a sort is done in the shuffle process, while there is no such operation in spark. So I added the following
functions in org.apache.pig.test.Util to sort the output before comparison only when in spark mode
public static void checkQueryOutputs(Iterator<Tuple> actualResults, List<Tuple> expectedResults, boolean checkAfterSort)
public static void checkQueryOutputs(Iterator<Tuple> actualResults, String[] expectedResults,LogicalSchema logicalSchema, boolean checkAfterSort) throws IOException
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestPigRunner.java, line 1155
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323890#file1323890line1155>
> >
> > Is there a jira to fix the number of records?
In spark mode, if pig.disable.counter = true, the number of records of the input is not calculated. I have added some comments to explain this.
> On May 22, 2016, 9:57 p.m., Rohini Palaniswamy wrote:
> > test/org/apache/pig/test/TestSecondarySort.java, line 520
> > <https://reviews.apache.org/r/45667/diff/1/?file=1323897#file1323897line520>
> >
> > Use Assume.assumeTrue
TestSecondarySort#testCustomPartitionerWithSort passes in spark mode now, so there is no need to skip this test.
- kelly
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
https://reviews.apache.org/r/45667/#review134255
-----------------------------------------------------------
On June 14, 2016, 4:39 a.m., Pallavi Rao wrote:
>
> -----------------------------------------------------------
> This is an automatically generated e-mail. To reply, visit:
> https://reviews.apache.org/r/45667/
> -----------------------------------------------------------
>
> (Updated June 14, 2016, 4:39 a.m.)
>
>
> Review request for pig, Daniel Dai and Rohini Palaniswamy.
>
>
> Bugs: PIG-4059 and PIG-4854
> https://issues.apache.org/jira/browse/PIG-4059
> https://issues.apache.org/jira/browse/PIG-4854
>
>
> Repository: pig-git
>
>
> Description
> -------
>
> The patch contains all the work done in the spark branch, so far.
>
>
> Diffs
> -----
>
> bin/pig 81f1426
> build.xml 99ba1f4
> ivy.xml dd9878e
> ivy/libraries.properties 3a819a5
> shims/test/hadoop20/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
> shims/test/hadoop23/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
> shims/test/hadoop23/org/apache/pig/test/TezMiniCluster.java 792a1bd
> shims/test/hadoop23/org/apache/pig/test/YarnMiniCluster.java PRE-CREATION
> src/META-INF/services/org.apache.pig.ExecType 5c034c8
> src/docs/src/documentation/content/xdocs/start.xml 36f9952
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/PhysicalOperator.java 1ff1abd
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POUserFunc.java ecf780c
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/plans/PhysicalPlan.java 2376d03
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POCollectedGroup.java bcbfe2b
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POFRJoin.java d80951a
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POForEach.java 21b75f1
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POGlobalRearrange.java 52cfb73
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POMergeJoin.java 13f70c0
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POSort.java c3a82c3
> src/org/apache/pig/backend/hadoop/executionengine/spark/JobGraphBuilder.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/JobMetricsListener.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/KryoSerializer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/MapReducePartitionerWrapper.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecType.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecutionEngine.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLauncher.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLocalExecType.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkUtil.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/UDFJarsFinder.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CollectedGroupConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CounterConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/DistinctConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FRJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FilterConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ForEachConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/GlobalRearrangeConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IndexedKey.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IteratorTransform.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LimitConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LoadConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LocalRearrangeConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeCogroupConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/OutputConsumerIterator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PackageConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PigSecondaryKeyComparatorSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RDDConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RankConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ReduceByConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SortConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SplitConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StoreConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StreamConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/UnionConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/NativeSparkOperator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POGlobalRearrangeSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POReduceBySpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/AccumulatorOptimizer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/CombinerOptimizer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/MultiQueryOptimizerSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/NoopFilterRemover.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/ParallelismSetter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/SecondaryKeyOptimizerSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/DotSparkPrinter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompiler.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompilerException.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOpPlanVisitor.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperPlan.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPOPackageAnnotator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPrinter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/running/PigInputFormatSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/util/AccumulatorOptimizerUtil.java c4b44ad
> src/org/apache/pig/backend/hadoop/executionengine/util/CombinerOptimizerUtil.java 889c01b
> src/org/apache/pig/backend/hadoop/executionengine/util/SecondaryKeyOptimizerUtil.java 0b59c9c
> src/org/apache/pig/backend/hadoop/hbase/HBaseStorage.java e0581d9
> src/org/apache/pig/data/SelfSpillBag.java d17f0a8
> src/org/apache/pig/impl/PigContext.java d43949f
> src/org/apache/pig/impl/plan/OperatorPlan.java 8b2e2e7
> src/org/apache/pig/tools/pigstats/PigStatsUtil.java 542cc2e
> src/org/apache/pig/tools/pigstats/spark/SparkCounter.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkCounterGroup.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkCounters.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkJobStats.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkPigStats.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkPigStatusReporter.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkScriptState.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkStatsUtil.java PRE-CREATION
> test/e2e/pig/build.xml f7c38ba
> test/e2e/pig/conf/spark.conf PRE-CREATION
> test/e2e/pig/drivers/TestDriverPig.pm bf9c302
> test/e2e/pig/tests/streaming.conf 18f2fb2
> test/excluded-tests-spark PRE-CREATION
> test/org/apache/pig/newplan/logical/relational/TestLocationInPhysicalPlan.java 94b34b3
> test/org/apache/pig/spark/TestIndexedKey.java PRE-CREATION
> test/org/apache/pig/spark/TestSecondarySortSpark.java PRE-CREATION
> test/org/apache/pig/test/MiniGenericCluster.java 9347269
> test/org/apache/pig/test/TestAssert.java 6d4b5c6
> test/org/apache/pig/test/TestBuiltin.java fbc3f1e
> test/org/apache/pig/test/TestCase.java c9bb2fa
> test/org/apache/pig/test/TestCollectedGroup.java a958d33
> test/org/apache/pig/test/TestCombiner.java df44293
> test/org/apache/pig/test/TestCubeOperator.java de96e6c
> test/org/apache/pig/test/TestEvalPipeline.java 9efde13
> test/org/apache/pig/test/TestEvalPipeline2.java c8f51d7
> test/org/apache/pig/test/TestEvalPipelineLocal.java c12d595
> test/org/apache/pig/test/TestFinish.java f18c103
> test/org/apache/pig/test/TestForEachNestedPlanLocal.java b0aa3a8
> test/org/apache/pig/test/TestGrunt.java 9eaf298
> test/org/apache/pig/test/TestHBaseStorage.java 8d2ad85
> test/org/apache/pig/test/TestLimitVariable.java 53b9dae
> test/org/apache/pig/test/TestMapSideCogroup.java 2c78b4a
> test/org/apache/pig/test/TestMergeJoin.java f1a9608
> test/org/apache/pig/test/TestMergeJoinOuter.java 81aee55
> test/org/apache/pig/test/TestMultiQuery.java c32eab7
> test/org/apache/pig/test/TestMultiQueryLocal.java b9ac035
> test/org/apache/pig/test/TestNativeMapReduce.java c4f6573
> test/org/apache/pig/test/TestNullConstant.java 3ea4509
> test/org/apache/pig/test/TestPigRunner.java fde8609
> test/org/apache/pig/test/TestPigServerLocal.java fbabd03
> test/org/apache/pig/test/TestProjectRange.java 2e3e7b8
> test/org/apache/pig/test/TestPruneColumn.java 3936332
> test/org/apache/pig/test/TestRank1.java 9e4ef62
> test/org/apache/pig/test/TestRank2.java fc802a9
> test/org/apache/pig/test/TestRank3.java 43af10d
> test/org/apache/pig/test/TestSecondarySort.java 8991010
> test/org/apache/pig/test/TestSkewedJoin.java dba2241
> test/org/apache/pig/test/TestStoreBase.java eb3b253
> test/org/apache/pig/test/Util.java 36d01e8
> test/spark-tests PRE-CREATION
>
> Diff: https://reviews.apache.org/r/45667/diff/
>
>
> Testing
> -------
>
> New UTs were added where required and ensure old UTs pass -> https://builds.apache.org/job/Pig-spark/
>
>
> Thanks,
>
> Pallavi Rao
>
>
Re: Review Request 45667: Support Pig On Spark
Posted by Rohini Palaniswamy <ro...@gmail.com>.
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
https://reviews.apache.org/r/45667/#review134255
-----------------------------------------------------------
Review comments for all non-spark classes.
bin/pig (line 365)
<https://reviews.apache.org/r/45667/#comment198955>
This is not a good idea. If I remember correctly, spark-assembly.jar is 128MB+. If you are copying all the individual jars that it is made up of to distcache for every job, it will suffer bad performance as copy to hdfs and localization by NM will be very costly.
Like Tez you can have users copy the assembly jar to hdfs and specify the hdfs location. This will ensure there is only one copy in hdfs and localization is done only once per node by node manager.
bin/pig (line 426)
<https://reviews.apache.org/r/45667/#comment198956>
Why is this extra variable needed? This will be a problem if users are programmatically calling PigServer instead of using bin/pig
build.xml (line 366)
<https://reviews.apache.org/r/45667/#comment198949>
Can you confirm that with all the spark changes running with hadoop 1.x (ant test -Dhadoopversion=20) is good?
build.xml (lines 401 - 415)
<https://reviews.apache.org/r/45667/#comment198950>
Please remove this. This target is supposed to contain the minimum jars required for just Pig without any hadoop, tez or spark dependencies.
build.xml (line 771)
<https://reviews.apache.org/r/45667/#comment198951>
Directory path will be build/ivy/lib/spark/Pig. You can just do build/ivy/lib/spark.
build.xml (line 1052)
<https://reviews.apache.org/r/45667/#comment198952>
spark mode
ivy.xml (line 451)
<https://reviews.apache.org/r/45667/#comment198954>
What does 2.10 in spark-core_2.10 and spark-yarn_2.10 signify?
ivy/libraries.properties (line 80)
<https://reviews.apache.org/r/45667/#comment198953>
Can we move to 1.6.1?
shims/test/hadoop20/org/apache/pig/test/SparkMiniCluster.java (line 68)
<https://reviews.apache.org/r/45667/#comment198958>
You will have to implement getLauncher method. Else will end up with infinite loop. Refer PIG-4530
shims/test/hadoop23/org/apache/pig/test/SparkMiniCluster.java (line 38)
<https://reviews.apache.org/r/45667/#comment198961>
You should move common code between TezMiniCluster and this into a YarnMiniCluster class to avoid duplication of code. TezMiniCluster and SparkMiniCluster can then extend that class and override specific methods.
shims/test/hadoop23/org/apache/pig/test/SparkMiniCluster.java (line 52)
<https://reviews.apache.org/r/45667/#comment198959>
Formatting nitpick. New line before the constructor.
src/META-INF/services/org.apache.pig.ExecType (lines 16 - 17)
<https://reviews.apache.org/r/45667/#comment198962>
Need to be uncommented back.
src/docs/src/documentation/content/xdocs/start.xml (line 99)
<https://reviews.apache.org/r/45667/#comment198963>
This file and content needs rebasing with trunk to include latest changes. Spark Local Mode should be after Tez Local Mode and Spark Mode after Tez Mode so that local modes are together as it is in the current trunk version.
src/docs/src/documentation/content/xdocs/start.xml (line 132)
<https://reviews.apache.org/r/45667/#comment198964>
six execution modes
src/docs/src/documentation/content/xdocs/start.xml (line 143)
<https://reviews.apache.org/r/45667/#comment198965>
1) Can you add a section for Spark Local Mode as well? In all other places of the documentation as well wherever there is tez_local.
2) Can we add a bulleted list for the different options of env::SPARK_MASTER for easy readability.
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POUserFunc.java (line 123)
<https://reviews.apache.org/r/45667/#comment198966>
Formatting nitpick. Space between () and {
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/plans/PhysicalPlan.java (line 153)
<https://reviews.apache.org/r/45667/#comment198967>
Can this additional method be avoided? What is the special case that this is required for?
src/org/apache/pig/backend/hadoop/executionengine/spark/JobGraphBuilder.java (line 72)
<https://reviews.apache.org/r/45667/#comment198968>
Formatting nitpick. Break into multiple lines.
src/org/apache/pig/backend/hadoop/executionengine/util/AccumulatorOptimizerUtil.java (line 294)
<https://reviews.apache.org/r/45667/#comment199079>
The utility class is for common code between mapreduce, tez and spark. Please move all Spark specific code to the spark AccumulatorOptimizer
src/org/apache/pig/backend/hadoop/executionengine/util/SecondaryKeyOptimizerUtil.java (line 59)
<https://reviews.apache.org/r/45667/#comment199080>
It is not a good idea to use a static variable to keep state in this static class. Multi-threaded use of Pig will have issues if different modes are used.
This utility class is for code common to mapreduce, tez and spark. Please move the spark specific code to spark's SecondaryKeyOptimizer. This is also necessary if we ever complete the mavenization and move tez and spark specific code to separate modules.
src/org/apache/pig/backend/hadoop/hbase/HBaseStorage.java (line 313)
<https://reviews.apache.org/r/45667/#comment199081>
Just change the code to use UDFContext.getUDFContext().getJobConf() which should not be null instead of getClientSystemProps(). Not sure why it is using getClientSystemProps() in the first place.
src/org/apache/pig/data/SelfSpillBag.java (line 32)
<https://reviews.apache.org/r/45667/#comment199082>
Why is bag even being serialized by Spark?
src/org/apache/pig/impl/PigContext.java (line 924)
<https://reviews.apache.org/r/45667/#comment199083>
This can be reverted. PigContext need not be serialized to the backend. See PIG-4866
test/e2e/pig/conf/spark.conf (line 58)
<https://reviews.apache.org/r/45667/#comment199031>
Why local instead of mapreduce?
test/org/apache/pig/newplan/logical/relational/TestLocationInPhysicalPlan.java (line 66)
<https://reviews.apache.org/r/45667/#comment199032>
Why does A[3,4] repeat?
test/org/apache/pig/spark/TestIndexedKey.java (lines 2 - 5)
<https://reviews.apache.org/r/45667/#comment199036>
Keep license formatting same in all the files. Might cause issue with RAT report.
test/org/apache/pig/test/TestBuiltin.java (line 3248)
<https://reviews.apache.org/r/45667/#comment199039>
Jira number does not seem to be the right one.
test/org/apache/pig/test/TestBuiltin.java (line 3255)
<https://reviews.apache.org/r/45667/#comment199040>
This testcase is broken if you have 0-0 repeating twice. It is not UniqueID anymore.
test/org/apache/pig/test/TestCollectedGroup.java (line 297)
<https://reviews.apache.org/r/45667/#comment199041>
Please use Assume.assumeTrue to skip tests for spark instead of if conditions. The test will also show up as Skipped instead of Passed.
Assume.assumeTrue("Skip this test for Spark. See <jira>", !Util.isSparkExecType(cluster.getExecType()));
test/org/apache/pig/test/TestCombiner.java (lines 189 - 191)
<https://reviews.apache.org/r/45667/#comment199043>
Can be removed with the change to checkCombinerUsed
test/org/apache/pig/test/TestCombiner.java (lines 239 - 241)
<https://reviews.apache.org/r/45667/#comment199044>
Can be removed with the change to checkCombinerUsed
test/org/apache/pig/test/TestCombiner.java (line 409)
<https://reviews.apache.org/r/45667/#comment199046>
Can we use the name "alias" instead of "variable" here?
test/org/apache/pig/test/TestCombiner.java (line 416)
<https://reviews.apache.org/r/45667/#comment199042>
Util.isSparkExecType
test/org/apache/pig/test/TestEvalPipeline.java (lines 431 - 436)
<https://reviews.apache.org/r/45667/#comment199053>
Can be extracted to a utility method as it is repeated multiple times.
test/org/apache/pig/test/TestEvalPipeline2.java (line 770)
<https://reviews.apache.org/r/45667/#comment199055>
Can be simplified as
Util.checkQueryOutputs(iter, expectedResults, Util.isSparkExecType());
by adding another Util method. Please replace in other tests as well where it has been changed to Util.checkQueryOutputsAfterSort instead of Util.checkQueryOutputs.
Util.java
public static void checkQueryOutputs(Iterator<Tuple> actualResults, Tuple[] expectedResults, boolean checkAfterSort) {
if (checkAfterSort) {
checkQueryOutputsAfterSort(actualResults, Arrays.asList(expectedResults));
} else {
checkQueryOutputs(actualResults, Arrays.asList(expectedResults));
}
}
test/org/apache/pig/test/TestEvalPipelineLocal.java (lines 1117 - 1118)
<https://reviews.apache.org/r/45667/#comment199056>
Code should be taking care of this.
test/org/apache/pig/test/TestFinish.java (line 48)
<https://reviews.apache.org/r/45667/#comment199058>
private static
test/org/apache/pig/test/TestFinish.java (line 72)
<https://reviews.apache.org/r/45667/#comment199057>
This should not be done. The cluster object is not something a user UDF will have access to.
test/org/apache/pig/test/TestFinish.java (line 144)
<https://reviews.apache.org/r/45667/#comment199059>
To be reverted
test/org/apache/pig/test/TestFinish.java (line 163)
<https://reviews.apache.org/r/45667/#comment199060>
To be reverted
test/org/apache/pig/test/TestGrunt.java (line 937)
<https://reviews.apache.org/r/45667/#comment199061>
Assume.assumeTrue
test/org/apache/pig/test/TestLimitVariable.java (line 93)
<https://reviews.apache.org/r/45667/#comment199062>
Please revert the type change to int. This testcase is testing bytearray casting.
test/org/apache/pig/test/TestLimitVariable.java (line 117)
<https://reviews.apache.org/r/45667/#comment199063>
Please revert the type change to int. This testcase is testing bytearray casting.
test/org/apache/pig/test/TestMapSideCogroup.java (line 339)
<https://reviews.apache.org/r/45667/#comment199066>
Why is the output ordering different with merge join? The assumption with merge join is that the files are sorted and the join is done based on that. So the ordering should be the same.
test/org/apache/pig/test/TestMergeJoin.java (line 641)
<https://reviews.apache.org/r/45667/#comment199064>
Assume.assumeTrue
test/org/apache/pig/test/TestMergeJoin.java (line 690)
<https://reviews.apache.org/r/45667/#comment199065>
Assume.assumeTrue
test/org/apache/pig/test/TestMergeJoinOuter.java (line 173)
<https://reviews.apache.org/r/45667/#comment199068>
Assume.assumeTrue for all tests in this class
test/org/apache/pig/test/TestMultiQuery.java (line 115)
<https://reviews.apache.org/r/45667/#comment199070>
Why are we using checkQueryOutputsAfterSortRecursive in many places when checkQueryOutputsAfterSort would do? It will unnecessarily increase the test execution time. Can they all be changed? Or am I missing something and checkQueryOutputsAfterSort cannot be used for some reason?
test/org/apache/pig/test/TestPigRunner.java (lines 333 - 334)
<https://reviews.apache.org/r/45667/#comment199074>
1 job for 1 POStore can impact performance. It is not exactly multi-query. Is there a jira to optimize this better and use 1 job?
test/org/apache/pig/test/TestPigRunner.java (line 1155)
<https://reviews.apache.org/r/45667/#comment199075>
Is there a jira to fix the number of records?
test/org/apache/pig/test/TestPigServerLocal.java (line 265)
<https://reviews.apache.org/r/45667/#comment199076>
Can you add _testParseBatchWithScripting as well?
test/org/apache/pig/test/TestSecondarySort.java (line 364)
<https://reviews.apache.org/r/45667/#comment199052>
It is not right to check output after sorting here. This is an order by query. Output should already be in the expected order.
test/org/apache/pig/test/TestSecondarySort.java (line 512)
<https://reviews.apache.org/r/45667/#comment199051>
Use Assume.assumeTrue
test/org/apache/pig/test/TestSkewedJoin.java (line 290)
<https://reviews.apache.org/r/45667/#comment199050>
Use Assume.assumeTrue
test/org/apache/pig/test/TestStoreBase.java (line 155)
<https://reviews.apache.org/r/45667/#comment199047>
Second store should not succeed.
test/org/apache/pig/test/TestStoreBase.java (line 165)
<https://reviews.apache.org/r/45667/#comment199049>
Setup job file of second one should also be there. Same with setup task as well. They are supposed to happen before putNext of both the stores is called.
- Rohini Palaniswamy
On April 4, 2016, 5:19 a.m., Pallavi Rao wrote:
>
> -----------------------------------------------------------
> This is an automatically generated e-mail. To reply, visit:
> https://reviews.apache.org/r/45667/
> -----------------------------------------------------------
>
> (Updated April 4, 2016, 5:19 a.m.)
>
>
> Review request for pig, Daniel Dai and Rohini Palaniswamy.
>
>
> Bugs: PIG-4059 and PIG-4854
> https://issues.apache.org/jira/browse/PIG-4059
> https://issues.apache.org/jira/browse/PIG-4854
>
>
> Repository: pig-git
>
>
> Description
> -------
>
> The patch contains all the work done in the spark branch, so far.
>
>
> Diffs
> -----
>
> bin/pig 81f1426
> build.xml 8db1a80
> ivy.xml dd9878e
> ivy/libraries.properties 55d9aed
> shims/test/hadoop20/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
> shims/test/hadoop23/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
> src/META-INF/services/org.apache.pig.ExecType 5c034c8
> src/docs/src/documentation/content/xdocs/start.xml 36f9952
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/PhysicalOperator.java 1ff1abd
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POUserFunc.java ecf780c
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/plans/PhysicalPlan.java 2376d03
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POCollectedGroup.java bcbfe2b
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POFRJoin.java 894cda7
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POForEach.java 21b75f1
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POGlobalRearrange.java 52cfb73
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POMergeJoin.java 6adfa91
> src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POSort.java c3a82c3
> src/org/apache/pig/backend/hadoop/executionengine/spark/JobGraphBuilder.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/JobMetricsListener.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/KryoSerializer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/MapReducePartitionerWrapper.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecType.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecutionEngine.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLauncher.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLocalExecType.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/SparkUtil.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/UDFJarsFinder.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CollectedGroupConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CounterConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/DistinctConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FRJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FilterConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ForEachConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/GlobalRearrangeConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IndexedKey.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IteratorTransform.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LimitConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LoadConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LocalRearrangeConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeCogroupConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/OutputConsumerIterator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PackageConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PigSecondaryKeyComparatorSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RDDConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RankConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ReduceByConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SortConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SplitConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StoreConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StreamConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/converter/UnionConverter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/NativeSparkOperator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POGlobalRearrangeSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POReduceBySpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/AccumulatorOptimizer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/CombinerOptimizer.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/MultiQueryOptimizerSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/NoopFilterRemover.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/ParallelismSetter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/SecondaryKeyOptimizerSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/DotSparkPrinter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompiler.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompilerException.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOpPlanVisitor.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperPlan.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPOPackageAnnotator.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPrinter.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/spark/running/PigInputFormatSpark.java PRE-CREATION
> src/org/apache/pig/backend/hadoop/executionengine/util/AccumulatorOptimizerUtil.java c4b44ad
> src/org/apache/pig/backend/hadoop/executionengine/util/CombinerOptimizerUtil.java 889c01b
> src/org/apache/pig/backend/hadoop/executionengine/util/SecondaryKeyOptimizerUtil.java 0b59c9c
> src/org/apache/pig/backend/hadoop/hbase/HBaseStorage.java e0581d9
> src/org/apache/pig/data/SelfSpillBag.java d17f0a8
> src/org/apache/pig/impl/PigContext.java d43949f
> src/org/apache/pig/impl/plan/OperatorPlan.java 8b2e2e7
> src/org/apache/pig/tools/pigstats/PigStatsUtil.java 542cc2e
> src/org/apache/pig/tools/pigstats/spark/SparkCounter.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkCounterGroup.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkCounters.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkJobStats.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkPigStats.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkPigStatusReporter.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkScriptState.java PRE-CREATION
> src/org/apache/pig/tools/pigstats/spark/SparkStatsUtil.java PRE-CREATION
> test/e2e/pig/build.xml f7c38ba
> test/e2e/pig/conf/spark.conf PRE-CREATION
> test/e2e/pig/drivers/TestDriverPig.pm bf9c302
> test/e2e/pig/tests/streaming.conf 18f2fb2
> test/excluded-tests-spark PRE-CREATION
> test/org/apache/pig/newplan/logical/relational/TestLocationInPhysicalPlan.java 94b34b3
> test/org/apache/pig/spark/TestIndexedKey.java PRE-CREATION
> test/org/apache/pig/spark/TestSecondarySortSpark.java PRE-CREATION
> test/org/apache/pig/test/MiniGenericCluster.java 9347269
> test/org/apache/pig/test/TestAssert.java 6d4b5c6
> test/org/apache/pig/test/TestBuiltin.java 44b4d09
> test/org/apache/pig/test/TestCase.java c9bb2fa
> test/org/apache/pig/test/TestCollectedGroup.java a958d33
> test/org/apache/pig/test/TestCombiner.java df44293
> test/org/apache/pig/test/TestCubeOperator.java de96e6c
> test/org/apache/pig/test/TestEvalPipeline.java 9efde13
> test/org/apache/pig/test/TestEvalPipeline2.java c8f51d7
> test/org/apache/pig/test/TestEvalPipelineLocal.java c12d595
> test/org/apache/pig/test/TestFinish.java f18c103
> test/org/apache/pig/test/TestForEachNestedPlanLocal.java b0aa3a8
> test/org/apache/pig/test/TestGrunt.java ef121a3
> test/org/apache/pig/test/TestHBaseStorage.java 8d2ad85
> test/org/apache/pig/test/TestLimitVariable.java 53b9dae
> test/org/apache/pig/test/TestMapSideCogroup.java 2c78b4a
> test/org/apache/pig/test/TestMergeJoin.java f1a9608
> test/org/apache/pig/test/TestMergeJoinOuter.java 81aee55
> test/org/apache/pig/test/TestMultiQuery.java 40684b4
> test/org/apache/pig/test/TestMultiQueryLocal.java b9ac035
> test/org/apache/pig/test/TestNativeMapReduce.java c4f6573
> test/org/apache/pig/test/TestNullConstant.java 3ea4509
> test/org/apache/pig/test/TestPigRunner.java fde8609
> test/org/apache/pig/test/TestPigServerLocal.java fbabd03
> test/org/apache/pig/test/TestProjectRange.java 2e3e7b8
> test/org/apache/pig/test/TestPruneColumn.java 3936332
> test/org/apache/pig/test/TestRank1.java 9e4ef62
> test/org/apache/pig/test/TestRank2.java fc802a9
> test/org/apache/pig/test/TestRank3.java 43af10d
> test/org/apache/pig/test/TestSecondarySort.java 8991010
> test/org/apache/pig/test/TestSkewedJoin.java dba2241
> test/org/apache/pig/test/TestStoreBase.java eb3b253
> test/org/apache/pig/test/Util.java 8dae247
> test/spark-tests PRE-CREATION
>
> Diff: https://reviews.apache.org/r/45667/diff/
>
>
> Testing
> -------
>
> New UTs were added where required, and old UTs were verified to pass -> https://builds.apache.org/job/Pig-spark/
>
>
> Thanks,
>
> Pallavi Rao
>
>
Re: Review Request 45667: Support Pig On Spark
Posted by Pallavi Rao <pa...@inmobi.com>.
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
https://reviews.apache.org/r/45667/
-----------------------------------------------------------
(Updated Oct. 27, 2016, 3:17 a.m.)
Review request for pig, Daniel Dai and Rohini Palaniswamy.
Bugs: PIG-4059 and PIG-4854
https://issues.apache.org/jira/browse/PIG-4059
https://issues.apache.org/jira/browse/PIG-4854
Repository: pig-git
Description
-------
The patch contains all the work done in the spark branch, so far.
Diffs (updated)
-----
bin/pig 81f1426
build.xml 99ba1f4
ivy.xml dd9878e
ivy/libraries.properties 3a819a5
shims/test/hadoop20/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
shims/test/hadoop23/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
shims/test/hadoop23/org/apache/pig/test/TezMiniCluster.java 792a1bd
shims/test/hadoop23/org/apache/pig/test/YarnMiniCluster.java PRE-CREATION
src/META-INF/services/org.apache.pig.ExecType 5c034c8
src/docs/src/documentation/content/xdocs/start.xml 36f9952
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/PhysicalOperator.java 1ff1abd
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POUserFunc.java ecf780c
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/plans/PhysicalPlan.java 2376d03
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POCollectedGroup.java bcbfe2b
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POFRJoin.java d80951a
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POForEach.java 21b75f1
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POGlobalRearrange.java 52cfb73
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POMergeJoin.java 13f70c0
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POSort.java c3a82c3
src/org/apache/pig/backend/hadoop/executionengine/spark/JobGraphBuilder.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/JobMetricsListener.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/KryoSerializer.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/MapReducePartitionerWrapper.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkEngineConf.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecType.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecutionEngine.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLauncher.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLocalExecType.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkUtil.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/UDFJarsFinder.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CollectedGroupConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CounterConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/DistinctConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FRJoinConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FilterConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ForEachConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/GlobalRearrangeConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IndexedKey.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IteratorTransform.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LimitConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LoadConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LocalRearrangeConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeCogroupConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeJoinConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/OutputConsumerIterator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PackageConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PigSecondaryKeyComparatorSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RDDConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RankConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ReduceByConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SortConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SplitConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StoreConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StreamConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/UnionConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/operator/NativeSparkOperator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POGlobalRearrangeSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POReduceBySpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/AccumulatorOptimizer.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/CombinerOptimizer.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/MultiQueryOptimizerSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/NoopFilterRemover.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/ParallelismSetter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/SecondaryKeyOptimizerSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/DotSparkPrinter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompiler.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompilerException.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOpPlanVisitor.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperPlan.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPOPackageAnnotator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPrinter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/running/PigInputFormatSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/util/AccumulatorOptimizerUtil.java c4b44ad
src/org/apache/pig/backend/hadoop/executionengine/util/CombinerOptimizerUtil.java 889c01b
src/org/apache/pig/backend/hadoop/executionengine/util/SecondaryKeyOptimizerUtil.java 0b59c9c
src/org/apache/pig/data/SelfSpillBag.java d17f0a8
src/org/apache/pig/impl/plan/OperatorPlan.java 8b2e2e7
src/org/apache/pig/impl/util/UDFContext.java 09afc0a
src/org/apache/pig/tools/pigstats/PigStatsUtil.java 542cc2e
src/org/apache/pig/tools/pigstats/spark/SparkCounter.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkCounterGroup.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkCounters.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkJobStats.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkPigStats.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkPigStatusReporter.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkScriptState.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkStatsUtil.java PRE-CREATION
test/e2e/pig/build.xml f7c38ba
test/e2e/pig/conf/spark.conf PRE-CREATION
test/e2e/pig/drivers/TestDriverPig.pm bf9c302
test/e2e/pig/tests/streaming.conf 18f2fb2
test/excluded-tests-spark PRE-CREATION
test/org/apache/pig/newplan/logical/relational/TestLocationInPhysicalPlan.java 94b34b3
test/org/apache/pig/spark/TestIndexedKey.java PRE-CREATION
test/org/apache/pig/spark/TestSecondarySortSpark.java PRE-CREATION
test/org/apache/pig/test/MiniGenericCluster.java 9347269
test/org/apache/pig/test/TestAssert.java 6d4b5c6
test/org/apache/pig/test/TestBuiltin.java fbc3f1e
test/org/apache/pig/test/TestCase.java c9bb2fa
test/org/apache/pig/test/TestCollectedGroup.java a958d33
test/org/apache/pig/test/TestCombiner.java df44293
test/org/apache/pig/test/TestCubeOperator.java de96e6c
test/org/apache/pig/test/TestEvalPipeline.java 9efde13
test/org/apache/pig/test/TestEvalPipeline2.java c8f51d7
test/org/apache/pig/test/TestEvalPipelineLocal.java c12d595
test/org/apache/pig/test/TestFinish.java f18c103
test/org/apache/pig/test/TestForEachNestedPlanLocal.java b0aa3a8
test/org/apache/pig/test/TestGrunt.java 9eaf298
test/org/apache/pig/test/TestHBaseStorage.java 8d2ad85
test/org/apache/pig/test/TestLimitVariable.java 53b9dae
test/org/apache/pig/test/TestMapSideCogroup.java 2c78b4a
test/org/apache/pig/test/TestMergeJoin.java f1a9608
test/org/apache/pig/test/TestMergeJoinOuter.java 81aee55
test/org/apache/pig/test/TestMultiQuery.java c32eab7
test/org/apache/pig/test/TestMultiQueryLocal.java b9ac035
test/org/apache/pig/test/TestNativeMapReduce.java c4f6573
test/org/apache/pig/test/TestNullConstant.java 3ea4509
test/org/apache/pig/test/TestPigRunner.java fde8609
test/org/apache/pig/test/TestPigServerLocal.java fbabd03
test/org/apache/pig/test/TestProjectRange.java 2e3e7b8
test/org/apache/pig/test/TestPruneColumn.java 3936332
test/org/apache/pig/test/TestRank1.java 9e4ef62
test/org/apache/pig/test/TestRank2.java fc802a9
test/org/apache/pig/test/TestRank3.java 43af10d
test/org/apache/pig/test/TestSecondarySort.java 8991010
test/org/apache/pig/test/TestSkewedJoin.java dba2241
test/org/apache/pig/test/TestStoreBase.java eb3b253
test/org/apache/pig/test/Util.java 36d01e8
test/spark-tests PRE-CREATION
Diff: https://reviews.apache.org/r/45667/diff/
Testing
-------
New UTs were added where required, and old UTs were verified to pass -> https://builds.apache.org/job/Pig-spark/
Thanks,
Pallavi Rao
Re: Review Request 45667: Support Pig On Spark
Posted by Pallavi Rao <pa...@inmobi.com>.
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
https://reviews.apache.org/r/45667/
-----------------------------------------------------------
(Updated July 11, 2016, 4:32 a.m.)
Review request for pig, Daniel Dai and Rohini Palaniswamy.
Changes
-------
Addressed some more review comments
Bugs: PIG-4059 and PIG-4854
https://issues.apache.org/jira/browse/PIG-4059
https://issues.apache.org/jira/browse/PIG-4854
Repository: pig-git
Description
-------
The patch contains all the work done in the spark branch, so far.
Diffs (updated)
-----
bin/pig 81f1426
build.xml 99ba1f4
ivy.xml dd9878e
ivy/libraries.properties 3a819a5
shims/test/hadoop20/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
shims/test/hadoop23/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
shims/test/hadoop23/org/apache/pig/test/TezMiniCluster.java 792a1bd
shims/test/hadoop23/org/apache/pig/test/YarnMiniCluster.java PRE-CREATION
src/META-INF/services/org.apache.pig.ExecType 5c034c8
src/docs/src/documentation/content/xdocs/start.xml 36f9952
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/PhysicalOperator.java 1ff1abd
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POUserFunc.java ecf780c
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/plans/PhysicalPlan.java 2376d03
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POCollectedGroup.java bcbfe2b
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POFRJoin.java d80951a
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POForEach.java 21b75f1
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POGlobalRearrange.java 52cfb73
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POMergeJoin.java 13f70c0
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POSort.java c3a82c3
src/org/apache/pig/backend/hadoop/executionengine/spark/JobGraphBuilder.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/JobMetricsListener.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/KryoSerializer.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/MapReducePartitionerWrapper.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecType.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecutionEngine.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLauncher.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLocalExecType.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkUtil.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/UDFJarsFinder.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CollectedGroupConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CounterConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/DistinctConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FRJoinConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FilterConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ForEachConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/GlobalRearrangeConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IndexedKey.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IteratorTransform.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LimitConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LoadConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LocalRearrangeConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeCogroupConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeJoinConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/OutputConsumerIterator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PackageConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PigSecondaryKeyComparatorSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RDDConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RankConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ReduceByConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SortConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SplitConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StoreConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StreamConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/UnionConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/operator/NativeSparkOperator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POGlobalRearrangeSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POReduceBySpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/AccumulatorOptimizer.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/CombinerOptimizer.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/MultiQueryOptimizerSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/NoopFilterRemover.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/ParallelismSetter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/SecondaryKeyOptimizerSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/DotSparkPrinter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompiler.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompilerException.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOpPlanVisitor.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperPlan.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPOPackageAnnotator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPrinter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/running/PigInputFormatSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/util/AccumulatorOptimizerUtil.java c4b44ad
src/org/apache/pig/backend/hadoop/executionengine/util/CombinerOptimizerUtil.java 889c01b
src/org/apache/pig/backend/hadoop/executionengine/util/SecondaryKeyOptimizerUtil.java 0b59c9c
src/org/apache/pig/backend/hadoop/hbase/HBaseStorage.java e0581d9
src/org/apache/pig/data/SelfSpillBag.java d17f0a8
src/org/apache/pig/impl/PigContext.java d43949f
src/org/apache/pig/impl/plan/OperatorPlan.java 8b2e2e7
src/org/apache/pig/tools/pigstats/PigStatsUtil.java 542cc2e
src/org/apache/pig/tools/pigstats/spark/SparkCounter.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkCounterGroup.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkCounters.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkJobStats.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkPigStats.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkPigStatusReporter.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkScriptState.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkStatsUtil.java PRE-CREATION
test/e2e/pig/build.xml f7c38ba
test/e2e/pig/conf/spark.conf PRE-CREATION
test/e2e/pig/drivers/TestDriverPig.pm bf9c302
test/e2e/pig/tests/streaming.conf 18f2fb2
test/excluded-tests-spark PRE-CREATION
test/org/apache/pig/newplan/logical/relational/TestLocationInPhysicalPlan.java 94b34b3
test/org/apache/pig/spark/TestIndexedKey.java PRE-CREATION
test/org/apache/pig/spark/TestSecondarySortSpark.java PRE-CREATION
test/org/apache/pig/test/MiniGenericCluster.java 9347269
test/org/apache/pig/test/TestAssert.java 6d4b5c6
test/org/apache/pig/test/TestBuiltin.java fbc3f1e
test/org/apache/pig/test/TestCase.java c9bb2fa
test/org/apache/pig/test/TestCollectedGroup.java a958d33
test/org/apache/pig/test/TestCombiner.java df44293
test/org/apache/pig/test/TestCubeOperator.java de96e6c
test/org/apache/pig/test/TestEvalPipeline.java 9efde13
test/org/apache/pig/test/TestEvalPipeline2.java c8f51d7
test/org/apache/pig/test/TestEvalPipelineLocal.java c12d595
test/org/apache/pig/test/TestFinish.java f18c103
test/org/apache/pig/test/TestForEachNestedPlanLocal.java b0aa3a8
test/org/apache/pig/test/TestGrunt.java 9eaf298
test/org/apache/pig/test/TestHBaseStorage.java 8d2ad85
test/org/apache/pig/test/TestLimitVariable.java 53b9dae
test/org/apache/pig/test/TestMapSideCogroup.java 2c78b4a
test/org/apache/pig/test/TestMergeJoin.java f1a9608
test/org/apache/pig/test/TestMergeJoinOuter.java 81aee55
test/org/apache/pig/test/TestMultiQuery.java c32eab7
test/org/apache/pig/test/TestMultiQueryLocal.java b9ac035
test/org/apache/pig/test/TestNativeMapReduce.java c4f6573
test/org/apache/pig/test/TestNullConstant.java 3ea4509
test/org/apache/pig/test/TestPigRunner.java fde8609
test/org/apache/pig/test/TestPigServerLocal.java fbabd03
test/org/apache/pig/test/TestProjectRange.java 2e3e7b8
test/org/apache/pig/test/TestPruneColumn.java 3936332
test/org/apache/pig/test/TestRank1.java 9e4ef62
test/org/apache/pig/test/TestRank2.java fc802a9
test/org/apache/pig/test/TestRank3.java 43af10d
test/org/apache/pig/test/TestSecondarySort.java 8991010
test/org/apache/pig/test/TestSkewedJoin.java dba2241
test/org/apache/pig/test/TestStoreBase.java eb3b253
test/org/apache/pig/test/Util.java 36d01e8
test/spark-tests PRE-CREATION
Diff: https://reviews.apache.org/r/45667/diff/
Testing
-------
New UTs were added where required, and we ensured that the old UTs pass -> https://builds.apache.org/job/Pig-spark/
Thanks,
Pallavi Rao
Re: Review Request 45667: Support Pig On Spark
Posted by Pallavi Rao <pa...@inmobi.com>.
-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
https://reviews.apache.org/r/45667/
-----------------------------------------------------------
(Updated June 14, 2016, 4:39 a.m.)
Review request for pig, Daniel Dai and Rohini Palaniswamy.
Changes
-------
Uploading patch on behalf of Kelly: the new patch addresses ~70% of the review comments.
Bugs: PIG-4059 and PIG-4854
https://issues.apache.org/jira/browse/PIG-4059
https://issues.apache.org/jira/browse/PIG-4854
Repository: pig-git
Description
-------
The patch contains all the work done in the spark branch so far.
Diffs (updated)
-----
bin/pig 81f1426
build.xml 99ba1f4
ivy.xml dd9878e
ivy/libraries.properties 3a819a5
shims/test/hadoop20/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
shims/test/hadoop23/org/apache/pig/test/SparkMiniCluster.java PRE-CREATION
shims/test/hadoop23/org/apache/pig/test/TezMiniCluster.java 792a1bd
shims/test/hadoop23/org/apache/pig/test/YarnMiniCluster.java PRE-CREATION
src/META-INF/services/org.apache.pig.ExecType 5c034c8
src/docs/src/documentation/content/xdocs/start.xml 36f9952
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/PhysicalOperator.java 1ff1abd
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/expressionOperators/POUserFunc.java ecf780c
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/plans/PhysicalPlan.java 2376d03
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POCollectedGroup.java bcbfe2b
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POFRJoin.java d80951a
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POForEach.java 21b75f1
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POGlobalRearrange.java 52cfb73
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POMergeJoin.java 13f70c0
src/org/apache/pig/backend/hadoop/executionengine/physicalLayer/relationalOperators/POSort.java c3a82c3
src/org/apache/pig/backend/hadoop/executionengine/spark/JobGraphBuilder.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/JobMetricsListener.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/KryoSerializer.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/MapReducePartitionerWrapper.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecType.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkExecutionEngine.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLauncher.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkLocalExecType.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/SparkUtil.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/UDFJarsFinder.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CollectedGroupConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/CounterConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/DistinctConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FRJoinConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/FilterConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ForEachConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/GlobalRearrangeConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IndexedKey.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/IteratorTransform.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LimitConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LoadConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/LocalRearrangeConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeCogroupConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/MergeJoinConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/OutputConsumerIterator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PackageConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/PigSecondaryKeyComparatorSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RDDConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/RankConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/ReduceByConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SkewedJoinConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SortConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/SplitConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StoreConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/StreamConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/converter/UnionConverter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/operator/NativeSparkOperator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POGlobalRearrangeSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/operator/POReduceBySpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/AccumulatorOptimizer.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/CombinerOptimizer.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/MultiQueryOptimizerSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/NoopFilterRemover.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/ParallelismSetter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/optimizer/SecondaryKeyOptimizerSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/DotSparkPrinter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompiler.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkCompilerException.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOpPlanVisitor.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperPlan.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkOperator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPOPackageAnnotator.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/plan/SparkPrinter.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/spark/running/PigInputFormatSpark.java PRE-CREATION
src/org/apache/pig/backend/hadoop/executionengine/util/AccumulatorOptimizerUtil.java c4b44ad
src/org/apache/pig/backend/hadoop/executionengine/util/CombinerOptimizerUtil.java 889c01b
src/org/apache/pig/backend/hadoop/executionengine/util/SecondaryKeyOptimizerUtil.java 0b59c9c
src/org/apache/pig/backend/hadoop/hbase/HBaseStorage.java e0581d9
src/org/apache/pig/data/SelfSpillBag.java d17f0a8
src/org/apache/pig/impl/PigContext.java d43949f
src/org/apache/pig/impl/plan/OperatorPlan.java 8b2e2e7
src/org/apache/pig/tools/pigstats/PigStatsUtil.java 542cc2e
src/org/apache/pig/tools/pigstats/spark/SparkCounter.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkCounterGroup.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkCounters.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkJobStats.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkPigStats.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkPigStatusReporter.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkScriptState.java PRE-CREATION
src/org/apache/pig/tools/pigstats/spark/SparkStatsUtil.java PRE-CREATION
test/e2e/pig/build.xml f7c38ba
test/e2e/pig/conf/spark.conf PRE-CREATION
test/e2e/pig/drivers/TestDriverPig.pm bf9c302
test/e2e/pig/tests/streaming.conf 18f2fb2
test/excluded-tests-spark PRE-CREATION
test/org/apache/pig/newplan/logical/relational/TestLocationInPhysicalPlan.java 94b34b3
test/org/apache/pig/spark/TestIndexedKey.java PRE-CREATION
test/org/apache/pig/spark/TestSecondarySortSpark.java PRE-CREATION
test/org/apache/pig/test/MiniGenericCluster.java 9347269
test/org/apache/pig/test/TestAssert.java 6d4b5c6
test/org/apache/pig/test/TestBuiltin.java fbc3f1e
test/org/apache/pig/test/TestCase.java c9bb2fa
test/org/apache/pig/test/TestCollectedGroup.java a958d33
test/org/apache/pig/test/TestCombiner.java df44293
test/org/apache/pig/test/TestCubeOperator.java de96e6c
test/org/apache/pig/test/TestEvalPipeline.java 9efde13
test/org/apache/pig/test/TestEvalPipeline2.java c8f51d7
test/org/apache/pig/test/TestEvalPipelineLocal.java c12d595
test/org/apache/pig/test/TestFinish.java f18c103
test/org/apache/pig/test/TestForEachNestedPlanLocal.java b0aa3a8
test/org/apache/pig/test/TestGrunt.java 9eaf298
test/org/apache/pig/test/TestHBaseStorage.java 8d2ad85
test/org/apache/pig/test/TestLimitVariable.java 53b9dae
test/org/apache/pig/test/TestMapSideCogroup.java 2c78b4a
test/org/apache/pig/test/TestMergeJoin.java f1a9608
test/org/apache/pig/test/TestMergeJoinOuter.java 81aee55
test/org/apache/pig/test/TestMultiQuery.java c32eab7
test/org/apache/pig/test/TestMultiQueryLocal.java b9ac035
test/org/apache/pig/test/TestNativeMapReduce.java c4f6573
test/org/apache/pig/test/TestNullConstant.java 3ea4509
test/org/apache/pig/test/TestPigRunner.java fde8609
test/org/apache/pig/test/TestPigServerLocal.java fbabd03
test/org/apache/pig/test/TestProjectRange.java 2e3e7b8
test/org/apache/pig/test/TestPruneColumn.java 3936332
test/org/apache/pig/test/TestRank1.java 9e4ef62
test/org/apache/pig/test/TestRank2.java fc802a9
test/org/apache/pig/test/TestRank3.java 43af10d
test/org/apache/pig/test/TestSecondarySort.java 8991010
test/org/apache/pig/test/TestSkewedJoin.java dba2241
test/org/apache/pig/test/TestStoreBase.java eb3b253
test/org/apache/pig/test/Util.java 36d01e8
test/spark-tests PRE-CREATION
Diff: https://reviews.apache.org/r/45667/diff/
Testing
-------
New UTs were added where required, and we ensured that the old UTs pass -> https://builds.apache.org/job/Pig-spark/
Thanks,
Pallavi Rao