You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by nz...@apache.org on 2011/02/11 00:27:35 UTC
svn commit: r1069611 - in /hive/trunk: ./
ql/src/java/org/apache/hadoop/hive/ql/exec/
ql/src/java/org/apache/hadoop/hive/ql/io/
ql/src/test/org/apache/hadoop/hive/ql/io/
Author: nzhang
Date: Thu Feb 10 23:27:34 2011
New Revision: 1069611
URL: http://svn.apache.org/viewvc?rev=1069611&view=rev
Log:
HIVE-1978 Hive SymlinkTextInputFormat does not estimate input size correctly (He Yongqiang via Ning Zhang)
Added:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/ContentSummaryInputFormat.java
Modified:
hive/trunk/CHANGES.txt
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SymlinkTextInputFormat.java
hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java
Modified: hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hive/trunk/CHANGES.txt?rev=1069611&r1=1069610&r2=1069611&view=diff
==============================================================================
--- hive/trunk/CHANGES.txt (original)
+++ hive/trunk/CHANGES.txt Thu Feb 10 23:27:34 2011
@@ -391,6 +391,9 @@ Trunk - Unreleased
HIVE-1865 redo zookeeper hive lock manager
(namit via He Yongqiang)
+ HIVE-1978 Hive SymlinkTextInputFormat does not estimate input size correctly
+ (He Yongqiang via Ning Zhang)
+
OPTIMIZATIONS
BUG FIXES
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java?rev=1069611&r1=1069610&r2=1069611&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java Thu Feb 10 23:27:34 2011
@@ -80,6 +80,8 @@ import org.apache.hadoop.hive.metastore.
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.QueryPlan;
+import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat;
+import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.ql.metadata.HiveException;
@@ -106,6 +108,7 @@ import org.apache.hadoop.io.SequenceFile
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
@@ -1438,8 +1441,19 @@ public final class Utilities {
ContentSummary cs = ctx.getCS(path);
if (cs == null) {
- FileSystem fs = p.getFileSystem(ctx.getConf());
- cs = fs.getContentSummary(p);
+ JobConf jobConf = new JobConf(ctx.getConf());
+ PartitionDesc partDesc = work.getPathToPartitionInfo().get(
+ p.toString());
+ Class<? extends InputFormat> inputFormatCls = partDesc
+ .getInputFileFormatClass();
+ InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(
+ inputFormatCls, jobConf);
+ if(inputFormatObj instanceof ContentSummaryInputFormat) {
+ cs = ((ContentSummaryInputFormat) inputFormatObj).getContentSummary(p, jobConf);
+ } else {
+ FileSystem fs = p.getFileSystem(ctx.getConf());
+ cs = fs.getContentSummary(p);
+ }
ctx.addCS(path, cs);
}
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/ContentSummaryInputFormat.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/ContentSummaryInputFormat.java?rev=1069611&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/ContentSummaryInputFormat.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/ContentSummaryInputFormat.java Thu Feb 10 23:27:34 2011
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.io;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.ContentSummary;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+
+/**
+ * ContentSummaryInputFormat provides an interface to let the input format itself
+ * figure out the content summary for a given input path.
+ */
+public interface ContentSummaryInputFormat {
+
+ public ContentSummary getContentSummary(Path p, JobConf job)
+ throws IOException;
+
+}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SymlinkTextInputFormat.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SymlinkTextInputFormat.java?rev=1069611&r1=1069610&r2=1069611&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SymlinkTextInputFormat.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SymlinkTextInputFormat.java Thu Feb 10 23:27:34 2011
@@ -9,6 +9,7 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -32,7 +33,7 @@ import org.apache.hadoop.mapred.TextInpu
*/
@SuppressWarnings("deprecation")
public class SymlinkTextInputFormat
- implements InputFormat<LongWritable, Text>, JobConfigurable {
+ implements InputFormat<LongWritable, Text>, JobConfigurable, ContentSummaryInputFormat {
/**
* This input split wraps the FileSplit generated from
* TextInputFormat.getSplits(), while setting the original link file path
@@ -181,4 +182,31 @@ public class SymlinkTextInputFormat
public void validateInput(JobConf job) throws IOException {
// do nothing
}
+
+ @Override
+ public ContentSummary getContentSummary(Path p, JobConf job)
+ throws IOException {
+ // summary[0] = total length, summary[1] = file count, summary[2] = directory count
+ long[] summary = {0, 0, 0};
+ List<Path> targetPaths = new ArrayList<Path>();
+ List<Path> symlinkPaths = new ArrayList<Path>();
+ try {
+ getTargetPathsFromSymlinksDirs(
+ job,
+ new Path[]{p},
+ targetPaths,
+ symlinkPaths);
+ } catch (Exception e) {
+ throw new IOException(
+ "Error parsing symlinks from specified job input path.", e);
+ }
+ for(Path path : targetPaths) {
+ FileSystem fs = path.getFileSystem(job);
+ ContentSummary cs = fs.getContentSummary(path);
+ summary[0] += cs.getLength();
+ summary[1] += cs.getFileCount();
+ summary[2] += cs.getDirectoryCount();
+ }
+ return new ContentSummary(summary[0], summary[1], summary[2]);
+ }
}
Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java?rev=1069611&r1=1069610&r2=1069611&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java Thu Feb 10 23:27:34 2011
@@ -10,6 +10,7 @@ import junit.framework.TestCase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
@@ -64,31 +65,53 @@ public class TestSymlinkTextInputFormat
*/
public void testAccuracy1() throws IOException {
// First data dir, contains 2 files.
- writeTextFile(new Path(dataDir1, "file1"),
+
+ FileSystem fs = dataDir1.getFileSystem(job);
+ int symbolLinkedFileSize = 0;
+
+ Path dir1_file1 = new Path(dataDir1, "file1");
+ writeTextFile(dir1_file1,
"dir1_file1_line1\n" +
"dir1_file1_line2\n");
- writeTextFile(new Path(dataDir1, "file2"),
+
+ symbolLinkedFileSize += fs.getFileStatus(dir1_file1).getLen();
+
+ Path dir1_file2 = new Path(dataDir1, "file2");
+ writeTextFile(dir1_file2,
"dir1_file2_line1\n" +
"dir1_file2_line2\n");
-
+
// Second data dir, contains 2 files.
- writeTextFile(new Path(dataDir2, "file1"),
+
+ Path dir2_file1 = new Path(dataDir2, "file1");
+ writeTextFile(dir2_file1,
"dir2_file1_line1\n" +
"dir2_file1_line2\n");
- writeTextFile(new Path(dataDir2, "file2"),
+
+ Path dir2_file2 = new Path(dataDir2, "file2");
+ writeTextFile(dir2_file2,
"dir2_file2_line1\n" +
"dir2_file2_line2\n");
+ symbolLinkedFileSize += fs.getFileStatus(dir2_file2).getLen();
+
// A symlink file, contains first file from first dir and second file from
// second dir.
writeSymlinkFile(
new Path(symlinkDir, "symlink_file"),
new Path(dataDir1, "file1"),
new Path(dataDir2, "file2"));
+
+ SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat();
+
+ // Test the content summary: it should cover only the two symlinked target files.
+ ContentSummary cs = inputFormat.getContentSummary(symlinkDir, job);
+
+ assertEquals(symbolLinkedFileSize, cs.getLength());
+ assertEquals(2, cs.getFileCount());
+ assertEquals(0, cs.getDirectoryCount());
FileInputFormat.setInputPaths(job, symlinkDir);
-
- SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat();
InputSplit[] splits = inputFormat.getSplits(job, 2);
log.info("Number of splits: " + splits.length);
@@ -126,6 +149,13 @@ public class TestSymlinkTextInputFormat
FileInputFormat.setInputPaths(job, symlinkDir);
SymlinkTextInputFormat inputFormat = new SymlinkTextInputFormat();
+
+ ContentSummary cs = inputFormat.getContentSummary(symlinkDir, job);
+
+ assertEquals(0, cs.getLength());
+ assertEquals(0, cs.getFileCount());
+ assertEquals(0, cs.getDirectoryCount());
+
InputSplit[] splits = inputFormat.getSplits(job, 2);
log.info("Number of splits: " + splits.length);