Posted to commits@hive.apache.org by he...@apache.org on 2011/05/20 19:30:56 UTC

svn commit: r1125478 - in /hive/trunk: common/src/java/org/apache/hadoop/hive/conf/ conf/ ql/src/java/org/apache/hadoop/hive/ql/index/ ql/src/test/queries/clientnegative/ ql/src/test/results/clientnegative/

Author: heyongqiang
Date: Fri May 20 17:30:55 2011
New Revision: 1125478

URL: http://svn.apache.org/viewvc?rev=1125478&view=rev
Log:
HIVE-2096: throw an error if the input is larger than a threshold for the index input format (Wojciech Galuba via He Yongqiang)

Added:
    hive/trunk/ql/src/test/queries/clientnegative/index_compact_entry_limit.q
    hive/trunk/ql/src/test/queries/clientnegative/index_compact_size_limit.q
    hive/trunk/ql/src/test/results/clientnegative/index_compact_entry_limit.q.out
    hive/trunk/ql/src/test/results/clientnegative/index_compact_size_limit.q.out
Modified:
    hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
    hive/trunk/conf/hive-default.xml
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndexResult.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndexedInputFormat.java

Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1125478&r1=1125477&r2=1125478&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Fri May 20 17:30:55 2011
@@ -372,6 +372,8 @@ public class HiveConf extends Configurat
     // Indexes
     HIVEOPTINDEXFILTER_COMPACT_MINSIZE("hive.optimize.index.filter.compact.minsize", (long) 5 * 1024 * 1024 * 1024), // 5G
     HIVEOPTINDEXFILTER_COMPACT_MAXSIZE("hive.optimize.index.filter.compact.maxsize", (long) -1), // infinity
+    HIVE_INDEX_COMPACT_QUERY_MAX_ENTRIES("hive.index.compact.query.max.entries", (long) 10000000), // 10M
+    HIVE_INDEX_COMPACT_QUERY_MAX_SIZE("hive.index.compact.query.max.size", (long) 10 * 1024 * 1024 * 1024), // 10G
 
     // Statistics
     HIVESTATSAUTOGATHER("hive.stats.autogather", true),
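
For reference, a minimal sketch (not part of this patch) of how the two new limits resolve at query time, using the same negative-means-infinity convention applied in HiveIndexResult and HiveIndexedInputFormat below; the class and method names here are hypothetical:

    import org.apache.hadoop.hive.conf.HiveConf;
    import org.apache.hadoop.hive.conf.HiveConf.ConfVars;

    public class CompactIndexLimitDefaults {
      // A negative configured value means "no limit".
      static long resolve(HiveConf conf, ConfVars var) {
        long v = HiveConf.getLongVar(conf, var);
        return v < 0 ? Long.MAX_VALUE : v;
      }

      public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        // With the defaults above: 10000000 entries and
        // 10 * 1024 * 1024 * 1024 = 10737418240 bytes (10G).
        System.out.println(resolve(conf, ConfVars.HIVE_INDEX_COMPACT_QUERY_MAX_ENTRIES));
        System.out.println(resolve(conf, ConfVars.HIVE_INDEX_COMPACT_QUERY_MAX_SIZE));
      }
    }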

Modified: hive/trunk/conf/hive-default.xml
URL: http://svn.apache.org/viewvc/hive/trunk/conf/hive-default.xml?rev=1125478&r1=1125477&r2=1125478&view=diff
==============================================================================
--- hive/trunk/conf/hive-default.xml (original)
+++ hive/trunk/conf/hive-default.xml Fri May 20 17:30:55 2011
@@ -1043,6 +1043,18 @@
 </property>
 
 <property>
+  <name>hive.index.compact.query.max.size</name>
+  <value>10737418240</value>
+  <description>The maximum number of bytes that a query using the compact index can read. A negative value is equivalent to infinity.</description>
+</property>
+
+<property>
+  <name>hive.index.compact.query.max.entries</name>
+  <value>10000000</value>
+  <description>The maximum number of index entries to read during a query that uses the compact index. A negative value is equivalent to infinity.</description>
+</property>
+
+<property>
   <name>hive.exim.uri.scheme.whitelist</name>
   <value>hdfs,pfile</value>
   <description>A comma separated list of acceptable URI schemes for import and export.</description>
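
These defaults can be tightened per job without editing hive-default.xml; the new tests below do it from the CLI with SET, and programmatic callers can do the equivalent through the generic Configuration setter. A hedged sketch (the values are arbitrary examples and the class name is hypothetical):

    import org.apache.hadoop.hive.conf.HiveConf;
    import org.apache.hadoop.hive.conf.HiveConf.ConfVars;

    public class TightenIndexLimits {
      public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        // Cap a compact-index query at 1 GiB of input and one million entries.
        conf.setLong(ConfVars.HIVE_INDEX_COMPACT_QUERY_MAX_SIZE.varname, 1024L * 1024 * 1024);
        conf.setLong(ConfVars.HIVE_INDEX_COMPACT_QUERY_MAX_ENTRIES.varname, 1000000L);
        // A negative value disables the corresponding check entirely.
        conf.setLong(ConfVars.HIVE_INDEX_COMPACT_QUERY_MAX_SIZE.varname, -1L);
      }
    }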

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndexResult.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndexResult.java?rev=1125478&r1=1125477&r2=1125478&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndexResult.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndexResult.java Fri May 20 17:30:55 2011
@@ -88,7 +88,7 @@ public class HiveIndexResult {
 
     bytesRef[0] = new BytesRefWritable();
     bytesRef[1] = new BytesRefWritable();
-    ignoreHdfsLoc = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_INDEX_IGNORE_HDFS_LOC); 
+    ignoreHdfsLoc = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_INDEX_IGNORE_HDFS_LOC);
 
     if (indexFile != null) {
       Path indexFilePath = new Path(indexFile);
@@ -104,12 +104,22 @@ public class HiveIndexResult {
         paths.add(indexFilePath);
       }
 
+      long maxEntriesToLoad = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVE_INDEX_COMPACT_QUERY_MAX_ENTRIES);
+      if (maxEntriesToLoad < 0) {
+        maxEntriesToLoad = Long.MAX_VALUE;
+      }
+
+      long lineCounter = 0;
       for (Path indexFinalPath : paths) {
         FSDataInputStream ifile = fs.open(indexFinalPath);
         LineReader lr = new LineReader(ifile, conf);
         try {
           Text line = new Text();
           while (lr.readLine(line) > 0) {
+            if (++lineCounter > maxEntriesToLoad) {
+              throw new HiveException("Number of compact index entries loaded during the query exceeded the maximum of " + maxEntriesToLoad
+                  + " set in " + HiveConf.ConfVars.HIVE_INDEX_COMPACT_QUERY_MAX_ENTRIES.varname);
+            }
             add(line);
           }
         }
@@ -140,7 +150,7 @@ public class HiveIndexResult {
               + line.toString());
     }
     String bucketFileName = new String(bytes, 0, firstEnd);
-    
+
     if (ignoreHdfsLoc) {
       Path tmpPath = new Path(bucketFileName);
       bucketFileName = tmpPath.toUri().getPath();
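
Note that lineCounter spans every file in paths, so the cap applies to the query as a whole rather than to each index file. The fail-fast shape of the check, restated standalone (all names hypothetical, not Hive API):

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.Reader;

    final class EntryCap {
      // Stream lines and abort as soon as the running count crosses the cap,
      // so an oversized index is rejected without first being buffered in memory.
      static long countWithCap(Reader in, long maxEntries) throws IOException {
        long cap = maxEntries < 0 ? Long.MAX_VALUE : maxEntries;
        long count = 0;
        BufferedReader reader = new BufferedReader(in);
        while (reader.readLine() != null) {
          if (++count > cap) {
            throw new IOException("index entries exceeded the maximum of " + cap);
          }
        }
        return count;
      }
    }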

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndexedInputFormat.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndexedInputFormat.java?rev=1125478&r1=1125477&r2=1125478&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndexedInputFormat.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndexedInputFormat.java Fri May 20 17:30:55 2011
@@ -26,6 +26,8 @@ import java.util.Set;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
 import org.apache.hadoop.hive.ql.io.HiveInputFormat;
@@ -129,6 +131,13 @@ public class HiveIndexedInputFormat exte
 
     ArrayList<HiveInputSplit> newSplits = new ArrayList<HiveInputSplit>(
         numSplits);
+
+    long maxInputSize = HiveConf.getLongVar(job, ConfVars.HIVE_INDEX_COMPACT_QUERY_MAX_SIZE);
+    if (maxInputSize < 0) {
+      maxInputSize = Long.MAX_VALUE;
+    }
+
+    long sumSplitLengths = 0;
     for (HiveInputSplit split : splits) {
       l4j.info("split start : " + split.getStart());
       l4j.info("split end : " + (split.getStart() + split.getLength()));
@@ -140,13 +149,19 @@ public class HiveIndexedInputFormat exte
           if (split.inputFormatClassName().contains("RCFile")
               || split.inputFormatClassName().contains("SequenceFile")) {
             if (split.getStart() > SequenceFile.SYNC_INTERVAL) {
-              newSplit = new HiveInputSplit(new FileSplit(split.getPath(), split
-                  .getStart()
-                  - SequenceFile.SYNC_INTERVAL, split.getLength()
-                  + SequenceFile.SYNC_INTERVAL, split.getLocations()), split
-                  .inputFormatClassName());
+              newSplit = new HiveInputSplit(new FileSplit(split.getPath(),
+                  split.getStart() - SequenceFile.SYNC_INTERVAL,
+                  split.getLength() + SequenceFile.SYNC_INTERVAL,
+                  split.getLocations()),
+                  split.inputFormatClassName());
             }
           }
+          sumSplitLengths += newSplit.getLength();
+          if (sumSplitLengths > maxInputSize) {
+            throw new IOException(
+                "Size of data to read during a compact-index-based query exceeded the maximum of "
+                    + maxInputSize + " set in " + ConfVars.HIVE_INDEX_COMPACT_QUERY_MAX_SIZE.varname);
+          }
           newSplits.add(newSplit);
         }
       } catch (HiveException e) {
@@ -156,7 +171,7 @@ public class HiveIndexedInputFormat exte
     }
     InputSplit retA[] = newSplits.toArray((new FileSplit[newSplits.size()]));
     l4j.info("Number of input splits: " + splits.length + " new input splits: "
-        + retA.length);
+        + retA.length + ", sum of split lengths: " + sumSplitLengths);
     return retA;
   }
 }
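
The size check is the same fail-fast pattern applied to bytes: the sum is taken over newSplit.getLength(), so the SequenceFile.SYNC_INTERVAL padding added above counts against the budget. A standalone sketch (hypothetical names, not Hive API):

    import java.io.IOException;

    final class SplitSizeBudget {
      // Accumulate accepted split lengths, failing as soon as the total
      // exceeds maxBytes; a negative maxBytes means unlimited.
      static long checkTotal(long[] splitLengths, long maxBytes) throws IOException {
        long budget = maxBytes < 0 ? Long.MAX_VALUE : maxBytes;
        long sum = 0;
        for (long len : splitLengths) {
          sum += len;
          if (sum > budget) {
            throw new IOException("split bytes exceeded the maximum of " + budget);
          }
        }
        return sum;
      }
    }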

Added: hive/trunk/ql/src/test/queries/clientnegative/index_compact_entry_limit.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/index_compact_entry_limit.q?rev=1125478&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/index_compact_entry_limit.q (added)
+++ hive/trunk/ql/src/test/queries/clientnegative/index_compact_entry_limit.q Fri May 20 17:30:55 2011
@@ -0,0 +1,11 @@
+drop index src_index on src;
+
+CREATE INDEX src_index ON TABLE src(key) as 'COMPACT' WITH DEFERRED REBUILD;
+ALTER INDEX src_index ON src REBUILD;
+
+SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+INSERT OVERWRITE DIRECTORY "/tmp/index_result" SELECT `_bucketname` ,  `_offsets` FROM default__src_src_index__ WHERE key<1000;
+SET hive.index.compact.file=/tmp/index_result;
+SET hive.input.format=org.apache.hadoop.hive.ql.index.compact.HiveCompactIndexInputFormat;
+SET hive.index.compact.query.max.entries=5;
+SELECT key, value FROM src WHERE key=100 ORDER BY key;

Added: hive/trunk/ql/src/test/queries/clientnegative/index_compact_size_limit.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientnegative/index_compact_size_limit.q?rev=1125478&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientnegative/index_compact_size_limit.q (added)
+++ hive/trunk/ql/src/test/queries/clientnegative/index_compact_size_limit.q Fri May 20 17:30:55 2011
@@ -0,0 +1,12 @@
+drop index src_index on src;
+
+CREATE INDEX src_index ON TABLE src(key) as 'COMPACT' WITH DEFERRED REBUILD;
+ALTER INDEX src_index ON src REBUILD;
+
+SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+INSERT OVERWRITE DIRECTORY "/tmp/index_result" SELECT `_bucketname` ,  `_offsets` FROM default__src_src_index__ WHERE key<1000;
+SET hive.index.compact.file=/tmp/index_result;
+SET hive.input.format=org.apache.hadoop.hive.ql.index.compact.HiveCompactIndexInputFormat;
+SET hive.index.compact.query.max.size=1024;
+SELECT key, value FROM src WHERE key=100 ORDER BY key;
+
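
In both negative tests the limit is deliberately set far below what the rebuilt index on src produces (5 entries in one, 1024 bytes of input in the other), so the final SELECT is expected to fail; the .q.out files below record the resulting MapRedTask error (return code 2) rather than query results.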

Added: hive/trunk/ql/src/test/results/clientnegative/index_compact_entry_limit.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/index_compact_entry_limit.q.out?rev=1125478&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/index_compact_entry_limit.q.out (added)
+++ hive/trunk/ql/src/test/results/clientnegative/index_compact_entry_limit.q.out Fri May 20 17:30:55 2011
@@ -0,0 +1,35 @@
+PREHOOK: query: drop index src_index on src
+PREHOOK: type: DROPINDEX
+POSTHOOK: query: drop index src_index on src
+POSTHOOK: type: DROPINDEX
+PREHOOK: query: CREATE INDEX src_index ON TABLE src(key) as 'COMPACT' WITH DEFERRED REBUILD
+PREHOOK: type: CREATEINDEX
+POSTHOOK: query: CREATE INDEX src_index ON TABLE src(key) as 'COMPACT' WITH DEFERRED REBUILD
+POSTHOOK: type: CREATEINDEX
+PREHOOK: query: ALTER INDEX src_index ON src REBUILD
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@default__src_src_index__
+POSTHOOK: query: ALTER INDEX src_index ON src REBUILD
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@default__src_src_index__
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: INSERT OVERWRITE DIRECTORY "/tmp/index_result" SELECT `_bucketname` ,  `_offsets` FROM default__src_src_index__ WHERE key<1000
+PREHOOK: type: QUERY
+PREHOOK: Input: default@default__src_src_index__
+PREHOOK: Output: /tmp/index_result
+POSTHOOK: query: INSERT OVERWRITE DIRECTORY "/tmp/index_result" SELECT `_bucketname` ,  `_offsets` FROM default__src_src_index__ WHERE key<1000
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@default__src_src_index__
+POSTHOOK: Output: /tmp/index_result
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: SELECT key, value FROM src WHERE key=100 ORDER BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/tmp/wgaluba/hive_2011-05-09_20-23-26_023_6902661313242990836/-mr-10000
+FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.MapRedTask

Added: hive/trunk/ql/src/test/results/clientnegative/index_compact_size_limit.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientnegative/index_compact_size_limit.q.out?rev=1125478&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientnegative/index_compact_size_limit.q.out (added)
+++ hive/trunk/ql/src/test/results/clientnegative/index_compact_size_limit.q.out Fri May 20 17:30:55 2011
@@ -0,0 +1,35 @@
+PREHOOK: query: drop index src_index on src
+PREHOOK: type: DROPINDEX
+POSTHOOK: query: drop index src_index on src
+POSTHOOK: type: DROPINDEX
+PREHOOK: query: CREATE INDEX src_index ON TABLE src(key) as 'COMPACT' WITH DEFERRED REBUILD
+PREHOOK: type: CREATEINDEX
+POSTHOOK: query: CREATE INDEX src_index ON TABLE src(key) as 'COMPACT' WITH DEFERRED REBUILD
+POSTHOOK: type: CREATEINDEX
+PREHOOK: query: ALTER INDEX src_index ON src REBUILD
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@default__src_src_index__
+POSTHOOK: query: ALTER INDEX src_index ON src REBUILD
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@default__src_src_index__
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: INSERT OVERWRITE DIRECTORY "/tmp/index_result" SELECT `_bucketname` ,  `_offsets` FROM default__src_src_index__ WHERE key<1000
+PREHOOK: type: QUERY
+PREHOOK: Input: default@default__src_src_index__
+PREHOOK: Output: /tmp/index_result
+POSTHOOK: query: INSERT OVERWRITE DIRECTORY "/tmp/index_result" SELECT `_bucketname` ,  `_offsets` FROM default__src_src_index__ WHERE key<1000
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@default__src_src_index__
+POSTHOOK: Output: /tmp/index_result
+POSTHOOK: Lineage: default__src_src_index__._bucketname SIMPLE [(src)src.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__._offsets EXPRESSION [(src)src.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__src_src_index__.key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: SELECT key, value FROM src WHERE key=100 ORDER BY key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/tmp/wgaluba/hive_2011-05-09_20-23-26_023_6902661313242990836/-mr-10000
+FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.MapRedTask