You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ch...@apache.org on 2014/02/18 20:24:35 UTC

svn commit: r1569490 - in /pig/trunk: CHANGES.txt conf/pig.properties src/org/apache/pig/PigConfiguration.java src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MRCompiler.java

Author: cheolsoo
Date: Tue Feb 18 19:24:34 2014
New Revision: 1569490

URL: http://svn.apache.org/r1569490
Log:
PIG-3648: Make the sample size for RandomSampleLoader configurable (cheolsoo)

Modified:
    pig/trunk/CHANGES.txt
    pig/trunk/conf/pig.properties
    pig/trunk/src/org/apache/pig/PigConfiguration.java
    pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MRCompiler.java

Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1569490&r1=1569489&r2=1569490&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Tue Feb 18 19:24:34 2014
@@ -30,6 +30,8 @@ PIG-2207: Support custom counters for ag
 
 IMPROVEMENTS
 
+PIG-3648: Make the sample size for RandomSampleLoader configurable (cheolsoo)
+
 PIG-259: allow store to overwrite existing directroy (nezihyigitbasi via daijy)
 
 PIG-2672: Optimize the use of DistributedCache (aniket486)

Modified: pig/trunk/conf/pig.properties
URL: http://svn.apache.org/viewvc/pig/trunk/conf/pig.properties?rev=1569490&r1=1569489&r2=1569490&view=diff
==============================================================================
--- pig/trunk/conf/pig.properties (original)
+++ pig/trunk/conf/pig.properties Tue Feb 18 19:24:34 2014
@@ -244,6 +244,10 @@ pig.location.check.strict=false
 # jobs with smaller input data size to run in local mode
 # pig.auto.local.input.maxbytes=100000000
 
+# Set this option to override the sample size of RandomSampleLoader for
+# order-by. The default value is 100 rows per task.
+# pig.random.sampler.sample.size=100
+
 # When enabled, jobs won't create empty part files if no output is written. In this case
 # PigOutputFormat will be wrapped with org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat.
 # pig.output.lazy=true

Modified: pig/trunk/src/org/apache/pig/PigConfiguration.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/PigConfiguration.java?rev=1569490&r1=1569489&r2=1569490&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/PigConfiguration.java (original)
+++ pig/trunk/src/org/apache/pig/PigConfiguration.java Tue Feb 18 19:24:34 2014
@@ -138,6 +138,12 @@ public class PigConfiguration {
      * the distributed cache when doing fragment-replicated join
      */
     public static final String PIG_JOIN_REPLICATED_MAX_BYTES = "pig.join.replicated.max.bytes";
+ 
+    /**
+     * This key is used to control the sample size of RandomSampleLoader for
+     * order-by. The default value is 100 rows per task.
+     */
+    public static final String PIG_RANDOM_SAMPLER_SAMPLE_SIZE = "pig.random.sampler.sample.size";
 
     /**
      * This key is to turn on auto local mode feature

Modified: pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MRCompiler.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MRCompiler.java?rev=1569490&r1=1569489&r2=1569490&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MRCompiler.java (original)
+++ pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/MRCompiler.java Tue Feb 18 19:24:34 2014
@@ -42,6 +42,7 @@ import org.apache.pig.FuncSpec;
 import org.apache.pig.IndexableLoadFunc;
 import org.apache.pig.LoadFunc;
 import org.apache.pig.OrderedLoadFunc;
+import org.apache.pig.PigConfiguration;
 import org.apache.pig.PigException;
 import org.apache.pig.PigWarning;
 import org.apache.pig.backend.executionengine.ExecException;
@@ -2346,8 +2347,9 @@ public class MRCompiler extends PhyPlanV
         // as its first constructor argument.
         
         rslargs[0] = (new FuncSpec(Utils.getTmpFileCompressorName(pigContext))).toString();
-        
-        rslargs[1] = "100"; // The value is calculated based on the file size for skewed join
+        // This value is only used by order by. For skewed join, it's calculated
+        // based on the file size.
+        rslargs[1] = pigContext.getProperties().getProperty(PigConfiguration.PIG_RANDOM_SAMPLER_SAMPLE_SIZE, "100");
         FileSpec quantLdFilName = new FileSpec(lFile.getFileName(),
         		new FuncSpec(sampleLdrClassName, rslargs));