You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by rh...@apache.org on 2014/01/29 19:58:19 UTC
svn commit: r1562547 - in /hive/trunk:
common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
conf/hive-default.xml.template
ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
Author: rhbutani
Date: Wed Jan 29 18:58:19 2014
New Revision: 1562547
URL: http://svn.apache.org/r1562547
Log:
HIVE-6300 Add documentation for stats configs to hive-default.xml.template (Prasanth J via Harish Butani, Lefty Leverenz)
Modified:
hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
hive/trunk/conf/hive-default.xml.template
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1562547&r1=1562546&r2=1562547&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Wed Jan 29 18:58:19 2014
@@ -652,14 +652,11 @@ public class HiveConf extends Configurat
HIVE_STATS_MAP_NUM_ENTRIES("hive.stats.map.num.entries", 10),
// to accurately compute statistics for GROUPBY map side parallelism needs to be known
HIVE_STATS_MAP_SIDE_PARALLELISM("hive.stats.map.parallelism", 1),
- // statistics annotation fetches column statistics for all required columns and for all
- // required partitions which can be very expensive sometimes
+ // statistics annotation fetches column statistics for all required columns which can
+ // be very expensive sometimes
HIVE_STATS_FETCH_COLUMN_STATS("hive.stats.fetch.column.stats", false),
- // in the absence of table/partition stats, average row size will be used to
- // estimate the number of rows/data size
- HIVE_STATS_AVG_ROW_SIZE("hive.stats.avg.row.size", 10000),
// in the absence of column statistics, the estimated number of rows/data size that will
- // emitted from join operator will depend on t factor
+ // be emitted from join operator will depend on this factor
HIVE_STATS_JOIN_FACTOR("hive.stats.join.factor", (float) 1.1),
// in the absence of uncompressed/raw data size, total file size will be used for statistics
// annotation. But the file may be compressed, encoded and serialized which may be lesser in size
Modified: hive/trunk/conf/hive-default.xml.template
URL: http://svn.apache.org/viewvc/hive/trunk/conf/hive-default.xml.template?rev=1562547&r1=1562546&r2=1562547&view=diff
==============================================================================
--- hive/trunk/conf/hive-default.xml.template (original)
+++ hive/trunk/conf/hive-default.xml.template Wed Jan 29 18:58:19 2014
@@ -1322,6 +1322,102 @@
</property>
<property>
+ <name>hive.stats.max.variable.length</name>
+ <value>100</value>
+ <description>
+ To estimate the size of data flowing through operators in Hive/Tez (for reducer estimation etc.),
+ average row size is multiplied with the total number of rows coming out of each operator.
+ Average row size is computed from average column size of all columns in the row. In the absence
+ of column statistics, for variable length columns (like string, bytes etc.), this value will be
+ used. For fixed length columns their corresponding Java equivalent sizes are used
+ (float - 4 bytes, double - 8 bytes etc.).
+ </description>
+</property>
+
+<property>
+ <name>hive.stats.list.num.entries</name>
+ <value>10</value>
+ <description>
+ To estimate the size of data flowing through operators in Hive/Tez (for reducer estimation etc.),
+ average row size is multiplied with the total number of rows coming out of each operator.
+ Average row size is computed from average column size of all columns in the row. In the absence
+ of column statistics and for variable length complex columns like list, the average number of
+ entries/values can be specified using this config.
+ </description>
+</property>
+
+<property>
+ <name>hive.stats.map.num.entries</name>
+ <value>10</value>
+ <description>
+ To estimate the size of data flowing through operators in Hive/Tez (for reducer estimation etc.),
+ average row size is multiplied with the total number of rows coming out of each operator.
+ Average row size is computed from average column size of all columns in the row. In the absence
+ of column statistics and for variable length complex columns like map, the average number of
+ entries/values can be specified using this config.
+ </description>
+</property>
+
+<property>
+ <name>hive.stats.map.parallelism</name>
+ <value>1</value>
+ <description>
+ Hive/Tez optimizer estimates the data size flowing through each of the operators.
+ For GROUPBY operator, to accurately compute the data size map-side parallelism needs to
+ be known. By default, this value is set to 1 since optimizer is not aware of the number of
+ mappers during compile-time. This Hive config can be used to specify the number of mappers
+ to be used for data size computation of GROUPBY operator.
+ </description>
+</property>
+
+<property>
+ <name>hive.stats.fetch.column.stats</name>
+ <value>false</value>
+ <description>
+ Annotation of operator tree with statistics information requires column statistics.
+ Column statistics are fetched from metastore. Fetching column statistics for each needed column
+ can be expensive when the number of columns is high. This flag can be used to disable fetching
+ of column statistics from metastore.
+ </description>
+</property>
+
+<property>
+ <name>hive.stats.fetch.partition.stats</name>
+ <value>true</value>
+ <description>
+ Annotation of operator tree with statistics information requires partition level basic
+ statistics like number of rows, data size and file size. Partition statistics are fetched from
+ metastore. Fetching partition statistics for each needed partition can be expensive when the
+ number of partitions is high. This flag can be used to disable fetching of partition statistics
+ from metastore. When this flag is disabled, Hive will make calls to filesystem to get file sizes
+ and will estimate the number of rows from row schema.
+ </description>
+</property>
+
+<property>
+ <name>hive.stats.join.factor</name>
+ <value>1.1</value>
+ <description>
+ Hive/Tez optimizer estimates the data size flowing through each of the operators. JOIN operator
+ uses column statistics to estimate the number of rows flowing out of it and hence the data size.
+ In the absence of column statistics, this factor determines the number of rows that flow out
+ of JOIN operator.
+ </description>
+</property>
+
+<property>
+ <name>hive.stats.deserialization.factor</name>
+ <value>1.0</value>
+ <description>
+ Hive/Tez optimizer estimates the data size flowing through each of the operators. In the absence
+ of basic statistics like number of rows and data size, file size is used to estimate the number
+ of rows and data size. Since files in tables/partitions are serialized (and optionally
+ compressed) the estimates of number of rows and data size cannot be reliably determined.
+ This factor is multiplied with the file size to account for serialization and compression.
+ </description>
+</property>
+
+<property>
<name>hive.support.concurrency</name>
<value>false</value>
<description>Whether Hive supports concurrency or not. A ZooKeeper instance must be up and running for the default Hive lock manager to support read-write locks.</description>
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java?rev=1562547&r1=1562546&r2=1562547&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java Wed Jan 29 18:58:19 2014
@@ -988,9 +988,6 @@ public class StatsRulesProcFactory {
if (limit <= parentStats.getNumRows()) {
long numRows = limit;
long avgRowSize = parentStats.getAvgRowSize();
- if (avgRowSize <= 0) {
- avgRowSize = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_AVG_ROW_SIZE);
- }
long dataSize = avgRowSize * limit;
wcStats.setNumRows(numRows);
wcStats.setDataSize(dataSize);