You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by na...@apache.org on 2009/06/25 03:00:33 UTC

svn commit: r788232 - in /hadoop/hive/trunk: ./ ql/src/java/org/apache/hadoop/hive/ql/exec/ ql/src/java/org/apache/hadoop/hive/ql/io/ ql/src/test/queries/clientnegative/ ql/src/test/results/clientnegative/

Author: namit
Date: Thu Jun 25 01:00:32 2009
New Revision: 788232

URL: http://svn.apache.org/viewvc?rev=788232&view=rev
Log:
HIVE-472. HiveFileFormatUtils's checkInputFormat does not include RCFile.
(He Yongqiang via namit)


Added:
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/InputFormatChecker.java
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SequenceFileInputFormatChecker.java
    hadoop/hive/trunk/ql/src/test/queries/clientnegative/load_wrong_fileformat_rc_seq.q
    hadoop/hive/trunk/ql/src/test/queries/clientnegative/load_wrong_fileformat_txt_seq.q
    hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat_rc_seq.q.out
    hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat_txt_seq.q.out
Modified:
    hadoop/hive/trunk/CHANGES.txt
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFileInputFormat.java
    hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat.q.out

Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=788232&r1=788231&r2=788232&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Thu Jun 25 01:00:32 2009
@@ -267,6 +267,9 @@
 
     HIVE-575. Fix Map join out-of-memory problem. (Namit Jain via zshao)
 
+    HIVE-472. HiveFileFormatUtils's checkInputFormat does not include RCFile.
+    (He Yongqiang via namit)
+
 Release 0.3.1 - Unreleased
 
   INCOMPATIBLE CHANGES

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java?rev=788232&r1=788231&r2=788232&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java Thu Jun 25 01:00:32 2009
@@ -123,7 +123,9 @@
           }
 
           // Check if the file format of the file matches that of the table.
-          HiveFileFormatUtils.checkInputFormat(fs, conf, tbd.getTable().getInputFileFormatClass(), files);
+          boolean flag = HiveFileFormatUtils.checkInputFormat(fs, conf, tbd.getTable().getInputFileFormatClass(), files);
+          if(!flag)
+            throw new HiveException("Wrong file format. Please check the file's format.");
         }
 
         if(tbd.getPartitionSpec().size() == 0) {

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java?rev=788232&r1=788231&r2=788232&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java Thu Jun 25 01:00:32 2009
@@ -22,6 +22,7 @@
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Set;
 
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
@@ -35,6 +36,7 @@
 import org.apache.hadoop.mapred.OutputFormat;
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.TextInputFormat;
 
 /**
  * An util class for various Hive file format tasks.
@@ -43,9 +45,6 @@
  * compatibility. They return the newly added HiveOutputFormat for the older 
  * ones.
  * 
- * }
- * 
- * 
  */
 public class HiveFileFormatUtils {
 
@@ -59,7 +58,7 @@
 
   @SuppressWarnings("unchecked")
   private static Map<Class<? extends OutputFormat>, Class<? extends HiveOutputFormat>> outputFormatSubstituteMap;
-
+  
   /**
    * register a substitute
    * 
@@ -104,35 +103,87 @@
     }
     return defaultFinalPath;
   }
+  
+  static {
+    inputFormatCheckerMap = new HashMap<Class<? extends InputFormat>, Class<? extends InputFormatChecker>>();
+    HiveFileFormatUtils.registerInputFormatChecker(SequenceFileInputFormat.class, SequenceFileInputFormatChecker.class);
+    HiveFileFormatUtils.registerInputFormatChecker(RCFileInputFormat.class, RCFileInputFormat.class);
+    inputFormatCheckerInstanceCache = new HashMap<Class<? extends InputFormatChecker>, InputFormatChecker>();
+  }
+
+  @SuppressWarnings("unchecked")
+  private static Map<Class<? extends InputFormat>, Class<? extends InputFormatChecker>> inputFormatCheckerMap;
+
+  private static Map<Class<? extends InputFormatChecker>, InputFormatChecker> inputFormatCheckerInstanceCache;
+
+  /**
+   * register an InputFormatChecker for a given InputFormat
+   * 
+   * @param format
+   *          the input format class the checker is registered for
+   * @param checker the InputFormatChecker class used to validate files for that input format
+   */
+  @SuppressWarnings("unchecked")
+  public synchronized static void registerInputFormatChecker(
+      Class<? extends InputFormat> format,
+      Class<? extends InputFormatChecker> checker) {
+    inputFormatCheckerMap.put(format, checker);
+  }
+
+  /**
+   * get the InputFormatChecker class registered for an input format, or null if none is registered.
+   */
+  public synchronized static Class<? extends InputFormatChecker> getInputFormatChecker(
+      Class<?> inputFormat) {
+    Class<? extends InputFormatChecker> result = inputFormatCheckerMap.get(inputFormat);
+    return result;
+  }
 
   /**
    * checks if files are in same format as the given input format
    */
+  @SuppressWarnings("unchecked")
   public static boolean checkInputFormat(FileSystem fs, HiveConf conf,
       Class<? extends InputFormat> inputFormatCls, ArrayList<FileStatus> files)
       throws HiveException {
     if (files.size() > 0) {
-      boolean tableIsSequenceFile = inputFormatCls
-          .equals(SequenceFileInputFormat.class);
-      int fileId = 0;
-      boolean fileIsSequenceFile = true;
-      try {
-        SequenceFile.Reader reader = new SequenceFile.Reader(fs, files.get(
-            fileId).getPath(), conf);
-        reader.close();
-      } catch (IOException e) {
-        fileIsSequenceFile = false;
-      }
-      if (!fileIsSequenceFile && tableIsSequenceFile) {
-        throw new HiveException(
-            "Cannot load text files into a table stored as SequenceFile.");
+      Class<? extends InputFormatChecker> checkerCls = getInputFormatChecker(inputFormatCls);
+      if(checkerCls==null && inputFormatCls.isAssignableFrom(TextInputFormat.class)) {
+        // We get a text input format here. We cannot determine whether a file is
+        // text from its content, so all we can do is test whether any other
+        // registered file format accepts it. If no other format accepts the file,
+        // we treat it as a text file, although it may not be one.
+       return checkTextInputFormat(fs, conf, files);
       }
-      if (fileIsSequenceFile && !tableIsSequenceFile) {
-        throw new HiveException(
-            "Cannot load SequenceFiles into a table stored as TextFile.");
+      
+      if (checkerCls != null) {
+        InputFormatChecker checkerInstance = inputFormatCheckerInstanceCache
+            .get(checkerCls);
+        try {
+          if (checkerInstance == null) {
+            checkerInstance = checkerCls.newInstance();
+            inputFormatCheckerInstanceCache.put(checkerCls, checkerInstance);
+          }
+          return checkerInstance.validateInput(fs, conf, files);
+        } catch (Exception e) {
+          throw new HiveException(e);
+        }
       }
       return true;
     }
     return false;
   }
+
+  @SuppressWarnings("unchecked")
+  private static boolean checkTextInputFormat(FileSystem fs, HiveConf conf,
+      ArrayList<FileStatus> files) throws HiveException {
+    Set<Class<? extends InputFormat>> inputFormatter = inputFormatCheckerMap
+        .keySet();
+    for (Class<? extends InputFormat> reg : inputFormatter) {
+      boolean result = checkInputFormat(fs, conf, reg, files);
+      if (result)
+        return false;
+    }
+    return true;
+  }
 }

Added: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/InputFormatChecker.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/InputFormatChecker.java?rev=788232&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/InputFormatChecker.java (added)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/InputFormatChecker.java Thu Jun 25 01:00:32 2009
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.io;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hive.conf.HiveConf;
+
+/**
+ * Check for validity of the input files.
+ */
+public interface InputFormatChecker {
+
+	/**
+	 * This method is used to validate the input files
+	 * 
+	 */
+	public boolean validateInput(FileSystem fs, HiveConf conf, ArrayList<FileStatus> files) throws IOException;
+
+}

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFileInputFormat.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFileInputFormat.java?rev=788232&r1=788231&r2=788232&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFileInputFormat.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFileInputFormat.java Thu Jun 25 01:00:32 2009
@@ -19,7 +19,11 @@
 package org.apache.hadoop.hive.ql.io;
 
 import java.io.IOException;
+import java.util.ArrayList;
 
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.SequenceFile;
@@ -31,7 +35,7 @@
 import org.apache.hadoop.mapred.Reporter;
 
 public class RCFileInputFormat<K extends LongWritable, V extends BytesRefArrayWritable>
-    extends FileInputFormat<K, V> {
+    extends FileInputFormat<K, V> implements InputFormatChecker{
 
   public RCFileInputFormat() {
     setMinSplitSize(SequenceFile.SYNC_INTERVAL);
@@ -45,4 +49,20 @@
 
     return new RCFileRecordReader(job, (FileSplit) split);
   }
+
+  @Override
+  public boolean validateInput(FileSystem fs, HiveConf conf,
+      ArrayList<FileStatus> files) throws IOException {
+    if (files.size() <= 0)
+      return false;
+    for (int fileId = 0; fileId < files.size(); fileId++) {
+      try {
+        RCFile.Reader reader = new RCFile.Reader(fs, files.get(fileId).getPath(), conf);
+        reader.close();
+      } catch (IOException e) {
+        return false;
+      }
+    }
+    return true;
+  }
 }

Added: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SequenceFileInputFormatChecker.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SequenceFileInputFormatChecker.java?rev=788232&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SequenceFileInputFormatChecker.java (added)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/SequenceFileInputFormatChecker.java Thu Jun 25 01:00:32 2009
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.io;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.io.SequenceFile;
+
+public class SequenceFileInputFormatChecker implements InputFormatChecker {
+
+  @Override
+  public boolean validateInput(FileSystem fs, HiveConf conf,
+      ArrayList<FileStatus> files) throws IOException {
+    if (files.size() <= 0)
+      return false;
+    for (int fileId = 0; fileId < files.size(); fileId++) {
+      try {
+        SequenceFile.Reader reader = new SequenceFile.Reader(fs, files.get(
+            fileId).getPath(), conf);
+        reader.close();
+      } catch (IOException e) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+}

Added: hadoop/hive/trunk/ql/src/test/queries/clientnegative/load_wrong_fileformat_rc_seq.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientnegative/load_wrong_fileformat_rc_seq.q?rev=788232&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientnegative/load_wrong_fileformat_rc_seq.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientnegative/load_wrong_fileformat_rc_seq.q Thu Jun 25 01:00:32 2009
@@ -0,0 +1,6 @@
+-- test for loading into tables with the correct file format
+-- test for loading into partitions with the correct file format
+
+DROP TABLE T1;
+CREATE TABLE T1(name STRING) STORED AS RCFILE;
+LOAD DATA LOCAL INPATH '../data/files/kv1.seq' INTO TABLE T1;
\ No newline at end of file

Added: hadoop/hive/trunk/ql/src/test/queries/clientnegative/load_wrong_fileformat_txt_seq.q
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/queries/clientnegative/load_wrong_fileformat_txt_seq.q?rev=788232&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/queries/clientnegative/load_wrong_fileformat_txt_seq.q (added)
+++ hadoop/hive/trunk/ql/src/test/queries/clientnegative/load_wrong_fileformat_txt_seq.q Thu Jun 25 01:00:32 2009
@@ -0,0 +1,6 @@
+-- test for loading into tables with the correct file format
+-- test for loading into partitions with the correct file format
+
+DROP TABLE T1;
+CREATE TABLE T1(name STRING) STORED AS TEXTFILE;
+LOAD DATA LOCAL INPATH '../data/files/kv1.seq' INTO TABLE T1;
\ No newline at end of file

Modified: hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat.q.out?rev=788232&r1=788231&r2=788232&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat.q.out (original)
+++ hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat.q.out Thu Jun 25 01:00:32 2009
@@ -4,5 +4,5 @@
 DROP TABLE T1
 query: CREATE TABLE T1(name STRING) STORED AS SEQUENCEFILE
 query: LOAD DATA LOCAL INPATH '../data/files/kv1.txt' INTO TABLE T1
-Failed with exception Cannot load text files into a table stored as SequenceFile.
+Failed with exception Wrong file format. Please check the file's format.
 FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.MoveTask

Added: hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat_rc_seq.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat_rc_seq.q.out?rev=788232&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat_rc_seq.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat_rc_seq.q.out Thu Jun 25 01:00:32 2009
@@ -0,0 +1,8 @@
+query: -- test for loading into tables with the correct file format
+-- test for loading into partitions with the correct file format
+
+DROP TABLE T1
+query: CREATE TABLE T1(name STRING) STORED AS RCFILE
+query: LOAD DATA LOCAL INPATH '../data/files/kv1.seq' INTO TABLE T1
+Failed with exception Wrong file format. Please check the file's format.
+FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.MoveTask

Added: hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat_txt_seq.q.out
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat_txt_seq.q.out?rev=788232&view=auto
==============================================================================
--- hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat_txt_seq.q.out (added)
+++ hadoop/hive/trunk/ql/src/test/results/clientnegative/load_wrong_fileformat_txt_seq.q.out Thu Jun 25 01:00:32 2009
@@ -0,0 +1,8 @@
+query: -- test for loading into tables with the correct file format
+-- test for loading into partitions with the correct file format
+
+DROP TABLE T1
+query: CREATE TABLE T1(name STRING) STORED AS TEXTFILE
+query: LOAD DATA LOCAL INPATH '../data/files/kv1.seq' INTO TABLE T1
+Failed with exception Wrong file format. Please check the file's format.
+FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.MoveTask