Posted to commits@pig.apache.org by ch...@apache.org on 2014/05/09 06:14:06 UTC

svn commit: r1593478 - in /pig/trunk: CHANGES.txt conf/pig.properties

Author: cheolsoo
Date: Fri May  9 04:14:06 2014
New Revision: 1593478

URL: http://svn.apache.org/r1593478
Log:
 PIG-3901: Organize the Pig properties file and document all properties (mrflip via cheolsoo)

Modified:
    pig/trunk/CHANGES.txt
    pig/trunk/conf/pig.properties

Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1593478&r1=1593477&r2=1593478&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Fri May  9 04:14:06 2014
@@ -32,6 +32,8 @@ PIG-2207: Support custom counters for ag
 
 IMPROVEMENTS
 
+PIG-3901: Organize the Pig properties file and document all properties (mrflip via cheolsoo)
+
 PIG-3867: Added hadoop home to build classpath for build pig with unit test on windows (Sergey Svinarchuk via gates)
 
 PIG-3914: Change TaskContext to abstract class (cheolsoo)

Modified: pig/trunk/conf/pig.properties
URL: http://svn.apache.org/viewvc/pig/trunk/conf/pig.properties?rev=1593478&r1=1593477&r2=1593478&view=diff
==============================================================================
--- pig/trunk/conf/pig.properties (original)
+++ pig/trunk/conf/pig.properties Fri May  9 04:14:06 2014
@@ -15,262 +15,537 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# Pig configuration file. All values can be overwritten by command line arguments.
+# Pig configuration file. All values can be overwritten by command line
+# arguments; for a description of the properties, run
+#
+#     pig -h properties
+#
+
+############################################################################
+#
+# == Logging properties
+#
 
-# Use the "-h properties" command to see description of the properties
+# Location of pig log file. If blank, a file with a timestamped slug
+# ('pig_1399336559369.log') will be generated in the current working directory.
+#
+# pig.logfile=
+# pig.logfile=/tmp/pig-err.log
 
-# log4jconf log4j configuration file
+# Log4j configuration file. Set at runtime with the -4 parameter. The source
+# distribution has a ./conf/log4j.properties.template file you can rename and
+# customize.
+#
 # log4jconf=./conf/log4j.properties
 
-# a file that contains pig script
-#file=
+# Verbose output.
+# * false (default): print only INFO and above to screen
+# * true: print all log messages to screen
+#
+# verbose=false
 
-# load jarfile, colon separated
-#jar=
+# Omit timestamps on log messages. (default: false)
+#
+# brief=false
 
-#verbose print all log messages to screen (default to print only INFO and above to screen)
-#verbose=true
+# Logging level. debug=OFF|ERROR|WARN|INFO|DEBUG (default: INFO)
+#
+# debug=INFO
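+#
+# For example, to run a single script at DEBUG level from the command line
+# ('myscript.pig' is a placeholder name):
+#
+#     pig -d DEBUG myscript.pig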
 
-#exectype local|mapreduce, mapreduce is default
-#exectype=local
-
-#the default timezone: if it is not set, the default timezone for this host is used.
-#the correct timezone format is the UTC offset: e.g., +08:00. 
-#pig.datetime.default.tz=
-
-#pig.logfile=
-
-#Do not spill temp files smaller than this size (bytes)
-#pig.spill.size.threshold=5000000
-
-#EXPERIMENT: Activate garbage collection when spilling a file bigger than this size (bytes)
-#This should help reduce the number of files being spilled.
-#pig.spill.gc.activation.size=40000000
-
-#the following two parameters are to help estimate the reducer number
-#pig.exec.reducers.bytes.per.reducer=1000000000
-#pig.exec.reducers.max=999
-
-#Logging properties
-#verbose=false
-#brief=false
-#debug=INFO
-#aggregate.warning=true
-
-#Performance tuning properties
-#pig.cachedbag.memusage=0.2
-#pig.skewedjoin.reduce.memusage=0.3
-#pig.exec.nocombiner=false
-#opt.multiquery=true
-#opt.fetch=true
-
-#Following parameters are for configuring intermediate storage format
-#Supported storage types are seqfile and tfile
-#Supported codec types: tfile supports gz(gzip) and lzo, seqfile support gz(gzip), lzo, snappy, bzip2
-#pig.tmpfilecompression=false
-#pig.tmpfilecompression.storage=seqfile
-#pig.tmpfilecompression.codec=gz
-
-#pig.noSplitCombination=true
-
-#pig.exec.mapPartAgg=false
-#pig.exec.mapPartAgg.minReduction=10
-
-#exectype=mapreduce
-#pig.additional.jars=<comma seperated list of jars>
-#udf.import.list=<comma seperated list of imports>
-#stop.on.failure=false
-
-#Use this option only when your Pig job will otherwise die because of
-#using more counters than hadoop configured limit
-#pig.disable.counter=true
+# Roll up warnings across tasks, so that when millions of mappers suddenly cry
+# out in error they are partially silenced. (default, recommended: true)
+#
+# aggregate.warning=true
 
-# By default, pig will allow 1GB of data to be replicated using
-# the distributed cache when doing fragment-replicated join.
-# pig.join.replicated.max.bytes=1000000000
+# Should DESCRIBE pretty-print its schema?
+# * false (default): print on a single line, suitable for pasting back into your script
+# * true (recommended): print on multiple lines with indentation, much more readable
+#
+# pig.pretty.print.schema=false
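+#
+# As an illustration (the output format is approximate), DESCRIBE on a relation
+# with two fields prints something like:
+#
+#     users: {name: chararray,age: int}       -- pig.pretty.print.schema=false
+#
+#     users: {
+#         name: chararray,
+#         age: int
+#     }                                       -- pig.pretty.print.schema=true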
 
-# Use this option to turn on UDF timers. This will cause two
-# counters to be tracked for every UDF and LoadFunc in your script:
-# approx_microsecs measures approximate time spent inside a UDF
-# approx_invocations reports the approximate number of times the UDF was invoked
+# Turn on UDF timers? This will cause two counters to be tracked for every UDF
+# and LoadFunc in your script: approx_microsecs measures approximate time spent
+# inside a UDF; approx_invocations reports the approximate number of times the
+# UDF was invoked.
+#
+# * false (default): do not record timing information of UDFs.
+# * true: report UDF performance. Uses more counters, but gives more insight
+#   into script operation
+#
 # pig.udf.profile=false
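+#
+# As with most properties, you can also enable this per-job on the command line
+# ('myscript.pig' is a placeholder):
+#
+#     pig -Dpig.udf.profile=true myscript.pig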
 
-#When enabled, 'describe' prints a multi-line formatted schema
-#(similar to an indended json) rather than on a single line.
-#pig.pretty.print.schema=true
+############################################################################
+#
+# == Site-specific Properties
+#
 
-#pig.sql.type=hcat
-hcat.bin=/usr/local/hcat/bin/hcat
+# Execution mode. Local mode is much faster, but only suitable for small amounts
+# of data. Local mode interprets paths on the local file system; MapReduce mode
+# on HDFS. Read more under 'Execution Modes' within the Getting Started
+# documentation.
+#
+# * mapreduce (default): use the Hadoop cluster defined in your Hadoop config files
+# * local: use local mode
+#
+# exectype=mapreduce
 
-############################ SchemaTuple ############################
+# Bootstrap file with default statements to execute in every Pig job, similar to
+# .bashrc.  If blank, uses the file '.pigbootup' from your home directory; if a
+# value is supplied, the '.pigbootup' file is NOT loaded.  This does not do tilde
+# expansion -- you must supply the full path to the file.
+#
+# pig.load.default.statements=
+# pig.load.default.statements=/home/bob/.pigrc
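+#
+# For illustration, such a bootstrap file might hold REGISTER and SET statements
+# you want applied to every job (the jar path is a placeholder):
+#
+#     REGISTER /usr/local/share/pig/piggybank.jar;
+#     SET default_parallel 20;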
 
-# Setting this value will turn on the SchemaTuple feature (PIG-2632)
-# This will attempt to use code generation for more efficient within
-# the pig code. This can lead to both CPU, serialization, and memory
-# benefits (currently, the potential memory benefits are the largest).
+# Kill all waiting/running MR jobs upon an MR job failure? (default: false) If
+# false, jobs that can proceed independently will do so unless a parent stage
+# fails. If true, the failure of any stage in the script kills all jobs.
+#
+# stop.on.failure=false
 
-# This parameter will enable the optimization in all available cases
-#pig.schematuple=true
+# File containing the pig script to run. Rarely set in the properties file.
+# Commandline: -f
+#
+# file=
 
-# Certain cases can be turned off by uncommenting the following. These will
-# all be off by default, but will all be turned on if pig.schematuple is set
-# to true.
+# Jarfile to load, colon separated. Rarely used.
+#
+# jar=
 
-# This will disable SchemaTuples in the case of udfs. Currently,
-# the input to UDF's will be SchemaTuples.
+# Register additional .jar files to use with your Pig script.
+# Most typically used as a command line option (see http://pig.apache.org/docs/r0.12.0/basic.html#register):
+#
+#     pig -Dpig.additional.jars=hdfs://nn.mydomain.com:9020/myjars/my.jar
+#
+# pig.additional.jars=<colon separated list of jars with optional wildcards>
+# pig.additional.jars=/usr/local/share/pig/pig/contrib/piggybank/java/piggybank.jar:/usr/local/share/pig/datafu/datafu-pig/build/libs/datafu-pig-1.2.1.jar
 
-#pig.schematuple.udf=false
+# Specify potential packages to which a UDF or a group of UDFs belong,
+# eliminating the need to qualify the UDF on every call. See
+# http://pig.apache.org/docs/r0.12.0/udf.html#use-short-names
+#
+# Commandline use:
+#
+#     pig \
+#       -Dpig.additional.jars=$PIG_HOME/contrib/piggybank/java/piggybank.jar:$PIG_HOME/../datafu/datafu-pig/build/libs/datafu-pig-1.2.1.jar \
+#       -Dudf.import.list=org.apache.pig.piggybank.evaluation:datafu.pig.util \
+#       happy_job.pig
+#
+# udf.import.list=<colon separated list of imports>
+# udf.import.list=org.apache.pig.piggybank.evaluation:datafu.pig.bags:datafu.pig.hash:datafu.pig.stats:datafu.pig.util
 
-# This is currently not implemented. In the future, LoadFunc's with known
-# schema's should output SchemaTuples
+#
+# Reuse jars across jobs run by the same user? (default: false) If enabled, jars
+# are placed in ${pig.user.cache.location}/${user.name}/.pigcache. Since most
+# jars change infrequently, this gives a minor speedup.
+#
+# pig.user.cache.enabled=false
 
-#pig.schematuple.load=false
+# Base path for storing jars cached by the pig.user.cache.enabled feature. (default: /tmp)
+#
+# pig.user.cache.location=/tmp
 
-# This will use SchemaTuples in replicated joins. The potential memory saving
-# here is significant. It will use SchemaTuples when it builds the HashMap of
-# the join key to related values.
+# Default UTC offset. (default: the host's current UTC offset) Supply a UTC
+# offset in Java's timezone format: e.g., +08:00.
+#
+# pig.datetime.default.tz=
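+#
+# For example, to fix the timezone at UTC+8 regardless of the host's setting:
+#
+# pig.datetime.default.tz=+08:00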
 
-#pig.schematuple.fr_join=false
+############################################################################
+#
+# Memory impacting properties
+#
 
-# In the current implementation of merge join, all of the Tuples in the left relation
-# that share a given key will be stored in a List in memory. This will use SchemaTuples
-# instead in that List.
+# Amount of memory (as fraction of heap) allocated to bags before a spill is
+# forced. Default is 0.2, meaning 20% of available memory. Note that this memory
+# is shared across all large bags used by the application. See
+# http://pig.apache.org/docs/r0.12.0/perf.html#memory-management
+#
+# pig.cachedbag.memusage=0.2
 
-#pig.schematuple.merge_join=false
+# Don't spill bags smaller than this size (bytes). Default: 5000000, or about
+# 5MB. Usually, more spilling means a longer runtime, so you might want to tune
+# this according to the heap size of each task and so forth.
+#
+# pig.spill.size.threshold=5000000
 
-#####################################################################
+# EXPERIMENTAL: If a file bigger than this size (bytes) is spilled -- thus
+# freeing a bunch of RAM -- tell the JVM to perform garbage collection.  This
+# should help reduce the number of files being spilled, but causes more-frequent
+# garbage collection. Default: 40000000 (about 40 MB)
+#
+# pig.spill.gc.activation.size=40000000
 
-##### Set up optional Pig Progress Notification Listener ############
+# Maximum amount of data to replicate using the distributed cache when doing
+# fragment-replicated join. (default: 1000000000, about 1GB) Consider increasing
+# this in a production environment, but carefully.
+#
+# pig.join.replicated.max.bytes=1000000000
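+#
+# For reference, a fragment-replicated join is requested in Pig Latin like this
+# (aliases are placeholders; the right-hand relation must fit in memory):
+#
+#     joined = JOIN big BY key, small BY key USING 'replicated';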
 
-# Note that only one PPNL can be set up. If you need several, write a PPNL that will chain them.
-# pig.notification.listener = <fully qualified class name of a PPNL implementation>
+# Fraction of heap available for the reducer to perform a skewed join. A low
+# fraction forces Pig to use more reducers, but increases the copying cost. See
+# http://pig.apache.org/docs/r0.12.0/perf.html#skewed-joins
+#
+# pig.skewedjoin.reduce.memusage=0.3
 
-# Optionally, you can supply a single String argument to pass to your PPNL. 
-# pig.notification.listener.arg = <somevalue>
+#
+# === SchemaTuple ===
+#
+# The SchemaTuple feature (PIG-2632) uses a tuple's schema (when known) to
+# generate a custom Java class to hold records. Otherwise, tuples are loaded as
+# a plain list that is unaware of its contents' schema -- and so each element
+# has to be wrapped as a Java object on its own. This can provide more efficient
+# CPU utilization, serialization, and most of all memory usage.
+#
+# This feature is considered experimental and is off by default. You can
+# selectively enable it for specific operations using pig.schematuple.udf,
+# pig.schematuple.load, pig.schematuple.fr_join and pig.schematuple.merge_join
+#
 
-#####################################################################
+# Enable the SchemaTuple optimization in all available cases? (default: false; recommended: true)
+#
+# pig.schematuple=false
+
+# EXPERIMENTAL: Use SchemaTuples with UDFs (default: value of pig.schematuple).
+# pig.schematuple.udf=false
+
+# EXPERIMENTAL, CURRENTLY NOT IMPLEMENTED, but in the future, LoadFunc's with
+# known schemas should output SchemaTuples. (default: value of pig.schematuple)
+# pig.schematuple.load=false
+
+# EXPERIMENTAL: Use SchemaTuples in replicated joins. The potential memory
+# saving here is significant. (default: value of pig.schematuple)
+# pig.schematuple.fr_join=false
+
+# EXPERIMENTAL: Use SchemaTuples in merge joins. (default: value of pig.schematuple).
+# pig.schematuple.merge_join=false
+
+############################################################################
+#
+# Serialization options
+#
+
+# Omit empty part files from the output? (default: false)
+#
+# * false (default): reducers generate an output file, even if output is empty
+# * true (recommended): do not generate zero-byte part files
+#
+# The default behavior of MapReduce is to generate an empty file for no data, so
+# Pig follows that. But many small files can cause annoying extra map tasks and
+# put load on HDFS, so consider setting this to 'true'.
+#
+# pig.output.lazy=false
+
+#
+# === Tempfile Handling
+#
+
+# EXPERIMENTAL: Storage format for temporary files generated by intermediate
+# stages of Pig jobs. This can provide significant speed increases for certain
+# codecs, as reducing the amount of data transferred to and from disk can more
+# than make up for the cost of compression/decompression. We recommend that you
+# set up LZO compression in Hadoop and specify tfile storage.
+#
+# Compress temporary files?
+# * false (default): do not compress
+# * true (recommended): compress temporary files.
+#
+# pig.tmpfilecompression=false
+# pig.tmpfilecompression=true
+
+# Tempfile storage container type.
+#
+# * tfile (default, recommended): more efficient, but only supports gz(gzip) and lzo compression.
+#   https://issues.apache.org/jira/secure/attachment/12396286/TFile%20Specification%2020081217.pdf
+# * seqfile: only supports gz(gzip), lzo, snappy, and bzip2 compression
+#
+# pig.tmpfilecompression.storage=tfile
+
+# Codec types for intermediate job files. tfile supports gz(gzip) and lzo;
+# seqfile supports gz(gzip), lzo, snappy, and bzip2.
+#
+# * lzo (recommended with caveats): moderate compression, low CPU burden;
+#   typically leads to a noticeable speedup. Best default choice, but you must
+#   set up LZO independently due to license incompatibility
+# * snappy: moderate compression, low CPU burden; typically leads to a noticeable speedup.
+# * gz (default): higher compression, high CPU burden. Typically leads to a noticeable slowdown.
+# * bzip2: most compression, major CPU burden. Typically leads to a noticeable slowdown.
+#
+# pig.tmpfilecompression.codec=gz
+
+#
+# === Split Combining
+#
+
+#
+# Should pig try to combine small files for fewer map tasks? This improves the
+# efficiency of jobs with many small input files, reduces the overhead on the
+# jobtracker, and reduces the number of output files a map-only job
+# produces. However, it only works with certain loaders and increases non-local
+# map tasks. See http://pig.apache.org/docs/r0.12.0/perf.html#combine-files
+#
+# * false (default, recommended): _do_ combine files
+# * true: do not combine files
+#
+# pig.noSplitCombination=false
+
+#
+# Size, in bytes, of data to be processed by a single map. Smaller files are
+# combined until this size is reached. If unset, defaults to the file system's
+# default block size.
+#
+# pig.maxCombinedSplitSize=
+
+# ###########################################################################
+#
+# Execution options
+#
+
+# Should pig omit combiners? (default, recommended: false -- meaning pig _will_
+# use combiners)
+#
+# When combiners work well, they eliminate a significant amount of
+# data. However, if they do not eliminate much data -- say, a DISTINCT operation
+# that only eliminates 5% of the records -- they add a noticeable overhead to
+# the job. So the recommended default is false (use combiners), selectively
+# disabling them per-job:
+#
+#     pig -Dpig.exec.nocombiner=true distinct_but_not_too_much.pig
+#
+# pig.exec.nocombiner=false
 
-########## Override the default Reducer Estimator logic #############
+# EXPERIMENTAL: Aggregate records in the map task before sending to the combiner?
+# (default: false, 10; recommended: true, 10). In cases where there is a massive
+# reduction of data in the aggregation step, pig can do a first pass of
+# aggregation before the data even leaves the mapper, saving much serialization
+# overhead. It's off by default but can give a major improvement to
+# group-and-aggregate operations. Pig skips partial aggregation unless reduction
+# is better than a factor of minReduction (default: 10). See
+# http://pig.apache.org/docs/r0.12.0/perf.html#hash-based-aggregation
+#
+# pig.exec.mapPartAgg=false
+# pig.exec.mapPartAgg.minReduction=10
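+#
+# For example, to enable it for a single job and require only a 5x reduction
+# ('group_heavy.pig' is a placeholder):
+#
+#     pig -Dpig.exec.mapPartAgg=true -Dpig.exec.mapPartAgg.minReduction=5 group_heavy.pig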
+
+#
+# === Control how many reducers are used
+#
+
+# Estimate number of reducers naively using a fixed amount of data per
+# reducer. Optimally, you have both fewer reducers than available reduce slots,
+# and reducers that are neither getting too little data (less than a half-GB or
+# so) nor too much data (more than 2-3 times the reducer child process max heap
+# size). The default of 1000000000 (about 1GB) is probably low for a production
+# cluster -- however it's much worse to set this too high (reducers spill many
+# times over in group-sort) than too low (delay waiting for reduce slots).
+#
+# pig.exec.reducers.bytes.per.reducer=1000000000
 
-# By default, the logic to estimate the number of reducers to use for a given job lives in:
-#   org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.InputSizeReducerEstimator
-# This logic can be replaced by implementing the following interface:
-#   org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigReducerEstimator
+#
+# Don't ever use more than this many reducers. (default: 999)
+#
+# pig.exec.reducers.max=999
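+#
+# You can also set reducer counts explicitly in Pig Latin, overriding the
+# estimate (a sketch; alias and field names are placeholders):
+#
+#     SET default_parallel 20;
+#     grouped = GROUP records BY user_id PARALLEL 40;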
 
-# This class will be invoked to estimate the number of reducers to use.
-# pig.exec.reducer.estimator = <fully qualified class name of a PigReducerEstimator implementation>
+#
+# === Local mode for small jobs
+#
 
-# Optionally, you can supply a single String argument to pass to your PigReducerEstimator.
-# pig.exec.reducer.estimator.arg = <somevalue>
+# EXPERIMENTAL: Use local mode for small jobs? If true, jobs with input data
+# size smaller than pig.auto.local.input.maxbytes bytes and one or no reducers
+# are run in local mode, which is much faster. Note that file paths are still
+# interpreted as exectype implies.
+#
+# * true (recommended): allow local mode for small jobs, which is much faster.
+# * false (default): always use the configured exectype.
+#
+# pig.auto.local.enabled=false
+
+#
+# Definition of a small job for the pig.auto.local.enabled feature. Only jobs
+# with less than this many bytes of input are candidates to run locally
+# (default: 100000000 bytes, about 100MB)
+#
+# pig.auto.local.input.maxbytes=100000000
+
+############################################################################
+#
+# Security Features
+#
+
+# Comma-delimited list of commands/operators that are disallowed. This security
+# feature can be used by administrators to block use of certain commands by
+# users.
+#
+# * <blank> (default): all commands and operators are allowed.
+# * fs,set (for example): block all filesystem commands and config changes from pig scripts.
+#
+# pig.blacklist=
+# pig.blacklist=fs,set
+
+# Comma-delimited list of the only commands/operators that are allowed. This
+# security feature can be used by administrators to block use of certain
+# commands by users.
+#
+# * <blank> (default): all commands and operators not on the pig.blacklist are allowed.
+# * load,store,filter,group: allow only LOAD, STORE, FILTER, and GROUP
+#   from pig scripts. All other commands and operators will fail.
+#
+# pig.whitelist=
+# pig.whitelist=load,store,filter,group
 
 #####################################################################
+#
+# Advanced Site-specific Customizations
+#
+
+# Remove intermediate output files?
+#
+# * true (default, recommended): remove the files
+# * false: do NOT remove the files
+#
+# Keeping them is useful for advanced debugging, but can be dangerous -- you
+# must clean them up yourself.  Inspect the intermediate outputs with
+#
+#     a = LOAD '/path/to/tmp/file' USING org.apache.pig.impl.io.TFileStorage();
+#
+# (Or ...SequenceFileInterStorage if pig.tmpfilecompression.storage is seqfile)
+#
+# pig.delete.temp.files=true
 
-###### Override the default Pig Stats Output Size Reader logic ######
+# EXPERIMENTAL: A Pig Progress Notification Listener (PPNL) lets you wire pig's
+# progress into your visibility stack. To use a PPNL, supply the fully qualified
+# class name of a PPNL implementation. Note that only one PPNL can be set up, so
+# if you need several, write a PPNL that will chain them.
+#
+# See https://github.com/twitter/ambrose for a pretty awesome one of these
+#
+# pig.notification.listener=<fully qualified class name of a PPNL implementation>
 
-# By default, the size of reducers output is computed as the total size of
-# output files. But since not every storage is file-based, this logic is not
-# always applicable. If that is the case, the logic can be replaced by
-# implementing the following interface:
-#   org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigStatsOutputSizeReader
+# String argument to pass to your PPNL constructor (optional). Only a single
+# string value is allowed. (default none)
+#
+# pig.notification.listener.arg=<somevalue>
 
-# This class will be invoked to compute the size of reducers output.
-# pig.stats.output.size.reader = <fully qualified class name of a PigStatsOutputSizeReader implementation>
+# EXPERIMENTAL: Class invoked to estimate the number of reducers to use.
+# (default: org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.InputSizeReducerEstimator)
+#
+# If you don't know how or why to write a PigReducerEstimator, you're unlikely
+# to use this. By default, the naive mapReduceLayer.InputSizeReducerEstimator is
+# used, but you can specify anything implementing the interface
+# org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigReducerEstimator
+#
+# pig.exec.reducer.estimator=<fully qualified class name of a PigReducerEstimator implementation>
+
+# Optional String argument to pass to your PigReducerEstimator. (default: none;
+# a single String argument is allowed).
+#
+# pig.exec.reducer.estimator.arg=<somevalue>
 
+# Class invoked to report the size of the reducers' output. By default, the reducers'
+# output is computed as the total size of output files. But not every storage is
+# file-based, and so this logic can be replaced by implementing the interface
+# org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigStatsOutputSizeReader
 # If you need to register more than one reader, you can register them as a comma
 # separated list. Every reader implements a boolean supports(POStore sto) method.
 # When there are more than one reader, they are consulted in order, and the
 # first one whose supports() method returns true will be used.
 #
-#####################################################################
-
-#pig.load.default.statements=
-
-#####################################################################
-
-########### Override hadoop configs programatically #################
+# pig.stats.output.size.reader=<fully qualified class name of a PigStatsOutputSizeReader implementation>
 
+#
+# Override hadoop configs programmatically
+#
 # By default, Pig expects hadoop configs (hadoop-site.xml and core-site.xml)
-# to be present on the classpath. There are cases when these configs are 
+# to be present on the classpath. There are cases when these configs are
 # needed to be passed programmatically, such as while using the PigServer API.
 # In such cases, you can override hadoop configs by setting the property
-# "pig.use.overriden.hadoop.configs". 
-# 
+# "pig.use.overriden.hadoop.configs".
+#
 # When this property is set to true, Pig does not look for hadoop configs
 # in the classpath and instead picks them up from the Properties/Configuration
 # object passed to it.
-
-# pig.use.overriden.hadoop.configs=false
 #
-######################################################################
-
-# Check if the script needs to check multiple stores writing
-# to the same location. When set to true, stops the execution
-# of script right away.
-pig.location.check.strict=false
-
-######################################################################
-
-# This key is used to define the default load func. Pig will fallback 
-# on PigStorage as default in case this is undefined.
+# pig.use.overriden.hadoop.configs=false
 
+# Implied LoadFunc for the LOAD operation when no USING clause is
+# present. Supply the fully qualified class name of a LoadFunc
+# implementation. Note: setting this means you will have to modify most code
+# brought in from elsewhere on the web, as people generally omit the USING
+# clause for TSV files.
+#
+# * org.apache.pig.builtin.PigStorage (default): the traditional tab-separated-values LoadFunc
+# * my.custom.udfcollection.MyCustomLoadFunc (for example): use MyCustomLoadFunc instead
+#
 # pig.default.load.func=<fully qualified class name of a LoadFunc implementation>
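+#
+# With the default PigStorage, these two statements behave identically
+# ('input.tsv' is a placeholder path):
+#
+#     a = LOAD 'input.tsv';
+#     a = LOAD 'input.tsv' USING PigStorage();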
-# For eg, pig.default.load.func=org.apache.pig.custom.MyCustomStorage
-
-# This key is used to define the default store func. Pig will fallback 
-# on PigStorage as default in case this is undefined.
 
+# The implied StoreFunc for STORE operations with no USING clause. Supply the
+# fully qualified class name of a StoreFunc implementation.
+#
+# * org.apache.pig.builtin.PigStorage (default): the traditional tab-separated-values StoreFunc.
+# * my.custom.udfcollection.MyCustomStoreFunc (for example): use MyCustomStoreFunc instead
+#
 # pig.default.store.func=<fully qualified class name of a StoreFunc implementation>
-# For eg, pig.default.store.func=org.apache.pig.custom.MyCustomStorage
 
-# This option is used to define whether to support recovery to handle the
-# application master getting restarted.
+# Recover jobs when the application master is restarted? (default: false). This
+# is a Hadoop 2 specific property; enable it to take advantage of AM recovery.
+#
 # pig.output.committer.recovery.support=true
 
-# By default, the size of pig script stored in job xml is limited to 10,240
-# characters. This property can be used to configure it.
-# pig.script.max.size=<somevalue>
-
-# Set this option to false to keep intermediate outputs files for debuggin
-# purpose. By default, it is set to true.
-# To inspect, use a = load '<path_to_tmp_file>' using org.apache.pig.impl.io.TFileStorage();
-# pig.delete.temp.files=true
+# Should scripts check to prevent multiple stores writing to the same location?
+# (default: false) When set to true, Pig stops the script's execution right away.
+#
+pig.location.check.strict=false
 
-# Set this option to true to convert jobs with input data size smaller than
-# pig.auto.local.input.maxbytes bytes and number of reducers <=1 to run in local mode
-# By default, this is set to false.
-# pig.auto.local.enabled=true
+# In addition to the fs-style commands (rm, ls, etc.), Pig can now execute
+# SQL-style DDL commands, e.g. "sql create table pig_test(name string, age int)".
+# The only implemented backend is hcat, and luckily that's also the default.
+#
+# pig.sql.type=hcat
 
-# Set value in long as a threshold number of bytes to convert
-# jobs with smaller input data size to run in local mode
-# pig.auto.local.input.maxbytes=100000000
+# Path to the hcat executable, for use with pig.sql.type=hcat (default: null)
+#
+hcat.bin=/usr/local/hcat/bin/hcat
 
-# Set this option to overwrite the sample size of RandomeSampleLoader for
-# order-by. The default value is 100 rows per task.
-# pig.random.sampler.sample.size=100
+###########################################################################
+#
+# Overrides for extreme environments
+#
+# (Most people won't have to adjust these parameters)
+#
 
-# When enabled, jobs won't create empty part files if no output is written. In this case
-# PigOutputFormat will be wrapped with org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat.
-# pig.output.lazy=true
-
-# Set this option to turn on additional jar caching for the user
-# pig.user.cache.enabled=true
-
-# This option defines location where additional jars are cached for the user.
-# Additional jar will be cached under PIG_USER_CACHE_LOCATION/${user.name}/.pigcache
-# and will be re-used across the jobs run by the user if the jar has not changed.
-# pig.user.cache.location=/tmp
 
-####################################################################################
+# Limit the pig script length placed in the jobconf xml. (default: 10240)
+# Extremely long queries can waste space in the JobConf; since its contents are
+# only advisory, the default is fine unless you are retaining it for forensics.
+#
+# pig.script.max.size=10240
 
-# Comma-delimited entries of commands/operators that must be disallowed. This is a
-# security feature to be used by administrators to block use of commands by users.
-# For eg, an admin might like to block all filesystem commands and setting configs
-# in pig script. In which case, the entry would be
-# pig.blacklist=fs,set
+# Disable use of counters by Pig. Note that the word 'counter' is singular here.
+#
+# * false (default, recommended): do NOT disable counters.
+# * true: disable counters. Set this to true only when your Pig job will
+#   otherwise die because of using more counters than the hadoop-configured limit.
+#
+# pig.disable.counter=true
 
-# Comma-delimited entries of commands/operators that must be allowed. This is a
-# security feature to be used by administrators to block use of commands by users
-# that are not a part of the whitelist.
-# For eg, an admin might like to allow only LOAD, STORE, FILTER, GROUP
-# in pig script. In which case, the entry would be
-# pig.whitelist=load,store,filter,group
+# Sample size (per-mapper, in number of rows) the ORDER..BY operation's
+# RandomSampleLoader uses to estimate how your data should be
+# partitioned. (default, recommended: 100 rows per task) Increase this if you
+# have exceptionally large input splits and are unhappy with the reducer skew.
+#
+# pig.random.sampler.sample.size=100
+
+# Process an entire script at once, reducing the amount of work and number of
+# tasks? (default, recommended: true) See http://pig.apache.org/docs/r0.12.0/perf.html#multi-query-execution
+#
+# MultiQuery optimization is very useful, and so the recommended default is
+# true. You may find that a script fails to compile under MultiQuery. If so,
+# disable it at runtime:
+#
+#     pig -no_multiquery script_that_makes_pig_sad.pig
+#
+# opt.multiquery=true
+
+# For small queries, fetch data directly from the HDFS. (default, recommended:
+# true). If you want to force Pig to launch an MR job, for example when you're
+# testing a live cluster, disable with the -N option. See PIG-3642.
+#
+# opt.fetch=true