You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ch...@apache.org on 2014/02/18 20:24:35 UTC
svn commit: r1569490 - in /pig/trunk: CHANGES.txt conf/pig.properties
src/org/apache/pig/PigConfiguration.java
src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MRCompiler.java
Author: cheolsoo
Date: Tue Feb 18 19:24:34 2014
New Revision: 1569490
URL: http://svn.apache.org/r1569490
Log:
PIG-3648: Make the sample size for RandomSampleLoader configurable (cheolsoo)
Modified:
pig/trunk/CHANGES.txt
pig/trunk/conf/pig.properties
pig/trunk/src/org/apache/pig/PigConfiguration.java
pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MRCompiler.java
Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1569490&r1=1569489&r2=1569490&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Tue Feb 18 19:24:34 2014
@@ -30,6 +30,8 @@ PIG-2207: Support custom counters for ag
IMPROVEMENTS
+PIG-3648: Make the sample size for RandomSampleLoader configurable (cheolsoo)
+
PIG-259: allow store to overwrite existing directroy (nezihyigitbasi via daijy)
PIG-2672: Optimize the use of DistributedCache (aniket486)
Modified: pig/trunk/conf/pig.properties
URL: http://svn.apache.org/viewvc/pig/trunk/conf/pig.properties?rev=1569490&r1=1569489&r2=1569490&view=diff
==============================================================================
--- pig/trunk/conf/pig.properties (original)
+++ pig/trunk/conf/pig.properties Tue Feb 18 19:24:34 2014
@@ -244,6 +244,10 @@ pig.location.check.strict=false
# jobs with smaller input data size to run in local mode
# pig.auto.local.input.maxbytes=100000000
+# Set this option to override the sample size of RandomSampleLoader for
+# order-by. The default value is 100 rows per task.
+# pig.random.sampler.sample.size=100
+
# When enabled, jobs won't create empty part files if no output is written. In this case
# PigOutputFormat will be wrapped with org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat.
# pig.output.lazy=true
Modified: pig/trunk/src/org/apache/pig/PigConfiguration.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/PigConfiguration.java?rev=1569490&r1=1569489&r2=1569490&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/PigConfiguration.java (original)
+++ pig/trunk/src/org/apache/pig/PigConfiguration.java Tue Feb 18 19:24:34 2014
@@ -138,6 +138,12 @@ public class PigConfiguration {
* the distributed cache when doing fragment-replicated join
*/
public static final String PIG_JOIN_REPLICATED_MAX_BYTES = "pig.join.replicated.max.bytes";
+
+ /**
+ * This key is used to control the sample size of RandomSampleLoader for
+ * order-by. The default value is 100 rows per task.
+ */
+ public static final String PIG_RANDOM_SAMPLER_SAMPLE_SIZE = "pig.random.sampler.sample.size";
/**
* This key is to turn on auto local mode feature
Modified: pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MRCompiler.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MRCompiler.java?rev=1569490&r1=1569489&r2=1569490&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MRCompiler.java (original)
+++ pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MRCompiler.java Tue Feb 18 19:24:34 2014
@@ -42,6 +42,7 @@ import org.apache.pig.FuncSpec;
import org.apache.pig.IndexableLoadFunc;
import org.apache.pig.LoadFunc;
import org.apache.pig.OrderedLoadFunc;
+import org.apache.pig.PigConfiguration;
import org.apache.pig.PigException;
import org.apache.pig.PigWarning;
import org.apache.pig.backend.executionengine.ExecException;
@@ -2346,8 +2347,9 @@ public class MRCompiler extends PhyPlanV
// as its first constructor argument.
rslargs[0] = (new FuncSpec(Utils.getTmpFileCompressorName(pigContext))).toString();
-
- rslargs[1] = "100"; // The value is calculated based on the file size for skewed join
+ // This value is only used by order by. For skewed join, it's calculated
+ // based on the file size.
+ rslargs[1] = pigContext.getProperties().getProperty(PigConfiguration.PIG_RANDOM_SAMPLER_SAMPLE_SIZE, "100");
FileSpec quantLdFilName = new FileSpec(lFile.getFileName(),
new FuncSpec(sampleLdrClassName, rslargs));