You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by nz...@apache.org on 2011/02/11 00:27:35 UTC

svn commit: r1069611 - in /hive/trunk: ./ ql/src/java/org/apache/hadoop/hive/ql/exec/ ql/src/java/org/apache/hadoop/hive/ql/io/ ql/src/test/org/apache/hadoop/hive/ql/io/

Author: nzhang
Date: Thu Feb 10 23:27:34 2011
New Revision: 1069611

URL: http://svn.apache.org/viewvc?rev=1069611&view=rev
Log:
HIVE-1978 Hive SymlinkTextInputFormat does not estimate input size correctly (He Yongqiang via Ning Zhang)

Added:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/ContentSummaryInputFormat.java
Modified:
    hive/trunk/CHANGES.txt
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SymlinkTextInputFormat.java
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java

Modified: hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hive/trunk/CHANGES.txt?rev=1069611&r1=1069610&r2=1069611&view=diff
==============================================================================
--- hive/trunk/CHANGES.txt (original)
+++ hive/trunk/CHANGES.txt Thu Feb 10 23:27:34 2011
@@ -391,6 +391,9 @@ Trunk -  Unreleased
     HIVE-1865 redo zookeeper hive lock manager
     (namit via He Yongqiang)
 
+    HIVE-1978 Hive SymlinkTextInputFormat does not estimate input size correctly
+    (He Yongqiang via Ning Zhang)
+
   OPTIMIZATIONS
 
   BUG FIXES

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java?rev=1069611&r1=1069610&r2=1069611&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java Thu Feb 10 23:27:34 2011
@@ -80,6 +80,8 @@ import org.apache.hadoop.hive.metastore.
 import org.apache.hadoop.hive.metastore.api.Order;
 import org.apache.hadoop.hive.ql.Context;
 import org.apache.hadoop.hive.ql.QueryPlan;
+import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat;
+import org.apache.hadoop.hive.ql.io.HiveInputFormat;
 import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat;
 import org.apache.hadoop.hive.ql.io.RCFile;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
@@ -106,6 +108,7 @@ import org.apache.hadoop.io.SequenceFile
 import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.DefaultCodec;
 import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.InputFormat;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
@@ -1438,8 +1441,19 @@ public final class Utilities {
 
         ContentSummary cs = ctx.getCS(path);
         if (cs == null) {
-          FileSystem fs = p.getFileSystem(ctx.getConf());
-          cs = fs.getContentSummary(p);
+          JobConf jobConf = new JobConf(ctx.getConf());
+          PartitionDesc partDesc = work.getPathToPartitionInfo().get(
+              p.toString());
+          Class<? extends InputFormat> inputFormatCls = partDesc
+              .getInputFileFormatClass();
+          InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(
+              inputFormatCls, jobConf);
+          if(inputFormatObj instanceof ContentSummaryInputFormat) {
+            cs = ((ContentSummaryInputFormat) inputFormatObj).getContentSummary(p, jobConf);
+          } else {
+            FileSystem fs = p.getFileSystem(ctx.getConf());
+            cs = fs.getContentSummary(p);
+          }
           ctx.addCS(path, cs);
         }
 

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/ContentSummaryInputFormat.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/ContentSummaryInputFormat.java?rev=1069611&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/ContentSummaryInputFormat.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/ContentSummaryInputFormat.java Thu Feb 10 23:27:34 2011
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.io;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.ContentSummary;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+
+/**
+ * ContentSummaryInputFormat provides an interface to let the input format itself
+ * figure out the content summary for a given input path.
+ */
+public interface ContentSummaryInputFormat {
+
+  public ContentSummary getContentSummary(Path p, JobConf job)
+      throws IOException;
+
+}

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SymlinkTextInputFormat.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SymlinkTextInputFormat.java?rev=1069611&r1=1069610&r2=1069611&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SymlinkTextInputFormat.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SymlinkTextInputFormat.java Thu Feb 10 23:27:34 2011
@@ -9,6 +9,7 @@ import java.util.ArrayList;
 import java.util.List;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.ContentSummary;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -32,7 +33,7 @@ import org.apache.hadoop.mapred.TextInpu
  */
 @SuppressWarnings("deprecation")
 public class SymlinkTextInputFormat
-    implements InputFormat<LongWritable, Text>, JobConfigurable {
+    implements InputFormat<LongWritable, Text>, JobConfigurable, ContentSummaryInputFormat {
   /**
    * This input split wraps the FileSplit generated from
    * TextInputFormat.getSplits(), while setting the original link file path
@@ -181,4 +182,31 @@ public class SymlinkTextInputFormat
   public void validateInput(JobConf job) throws IOException {
     // do nothing
   }
+
+  @Override
+  public ContentSummary getContentSummary(Path p, JobConf job)
+      throws IOException {
+    //length, file count, directory count
+    long[] summary = {0, 0, 0};
+    List<Path> targetPaths = new ArrayList<Path>();
+    List<Path> symlinkPaths = new ArrayList<Path>();
+    try {
+      getTargetPathsFromSymlinksDirs(
+          job,
+          new Path[]{p},
+          targetPaths,
+          symlinkPaths);
+    } catch (Exception e) {
+      throw new IOException(
+          "Error parsing symlinks from specified job input path.", e);
+    }
+    for(Path path : targetPaths) {
+      FileSystem fs = path.getFileSystem(job);
+      ContentSummary cs = fs.getContentSummary(path);
+      summary[0] += cs.getLength();
+      summary[1] += cs.getFileCount();
+      summary[2] += cs.getDirectoryCount();
+    }
+    return new ContentSummary(summary[0], summary[1], summary[2]);
+  }
 }

Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java?rev=1069611&r1=1069610&r2=1069611&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java Thu Feb 10 23:27:34 2011
@@ -10,6 +10,7 @@ import junit.framework.TestCase;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.ContentSummary;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.LongWritable;
@@ -64,31 +65,53 @@ public class TestSymlinkTextInputFormat 
    */
   public void testAccuracy1() throws IOException {
     // First data dir, contains 2 files.
-    writeTextFile(new Path(dataDir1, "file1"),
+    
+    FileSystem fs = dataDir1.getFileSystem(job);
+    int symbolLinkedFileSize = 0;
+    
+    Path dir1_file1 = new Path(dataDir1, "file1");
+    writeTextFile(dir1_file1,
                   "dir1_file1_line1\n" +
                   "dir1_file1_line2\n");
-    writeTextFile(new Path(dataDir1, "file2"),
+    
+    symbolLinkedFileSize += fs.getFileStatus(dir1_file1).getLen();
+    
+    Path dir1_file2 = new Path(dataDir1, "file2");
+    writeTextFile(dir1_file2,
                   "dir1_file2_line1\n" +
                   "dir1_file2_line2\n");
-
+    
     // Second data dir, contains 2 files.
-    writeTextFile(new Path(dataDir2, "file1"),
+    
+    Path dir2_file1 = new Path(dataDir2, "file1");
+    writeTextFile(dir2_file1,
                   "dir2_file1_line1\n" +
                   "dir2_file1_line2\n");
-    writeTextFile(new Path(dataDir2, "file2"),
+    
+    Path dir2_file2 = new Path(dataDir2, "file2");
+    writeTextFile(dir2_file2,
                   "dir2_file2_line1\n" +
                   "dir2_file2_line2\n");
 
+    symbolLinkedFileSize += fs.getFileStatus(dir2_file2).getLen();
+    
     // A symlink file, contains first file from first dir and second file from
     // second dir.
     writeSymlinkFile(
         new Path(symlinkDir, "symlink_file"),
         new Path(dataDir1, "file1"),
         new Path(dataDir2, "file2"));
+    
+    SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat();
+    
+    //test content summary
+    ContentSummary cs = inputFormat.getContentSummary(symlinkDir, job);
+    
+    assertEquals(symbolLinkedFileSize, cs.getLength());
+    assertEquals(2, cs.getFileCount());
+    assertEquals(0, cs.getDirectoryCount());
 
     FileInputFormat.setInputPaths(job, symlinkDir);
-
-    SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat();
     InputSplit[] splits = inputFormat.getSplits(job, 2);
 
     log.info("Number of splits: " + splits.length);
@@ -126,6 +149,13 @@ public class TestSymlinkTextInputFormat 
     FileInputFormat.setInputPaths(job, symlinkDir);
 
     SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat();
+    
+    ContentSummary cs = inputFormat.getContentSummary(symlinkDir, job);
+    
+    assertEquals(0, cs.getLength());
+    assertEquals(0, cs.getFileCount());
+    assertEquals(0, cs.getDirectoryCount());
+    
     InputSplit[] splits = inputFormat.getSplits(job, 2);
 
     log.info("Number of splits: " + splits.length);