You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2021/04/26 15:46:45 UTC

[impala] 01/04: IMPALA-10658: LOAD DATA INPATH silently fails between HDFS and Azure ABFS

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 8336b7b3cd8ba90d37c3f7454a9c9c4074bca1f0
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Tue Apr 13 22:04:49 2021 +0200

    IMPALA-10658: LOAD DATA INPATH silently fails between HDFS and Azure ABFS
    
    LOAD DATA INPATH silently fails when Impala tries to move files from
    HDFS to ABFS. The problem is that we use FileSystem.makeQualified(Path)
    to decide if path is on a given filesystem. We expect to get an
    IllegalArgumentException if path is on a different filesystem. However,
    the Azure FileSystem implementation doesn't throw this exception.
    
    Because of that Impala thinks that an 'hdfs://' path and an 'abfs://'
    path is on the same filesystem, so it tries to move files with
    FileSystem.rename(). In case of errors rename() might throw an
    Exception, or return false. Impala doesn't check the return value,
    therefore if rename() returns false then the error remains silent.
    
    This patch fixes Impala's isPathOnFileSystem() and also adds a check
    for the return value of rename().
    
    Testing:
     * tested manually between HDFS and Azure ABFS.
     * added JUnit test to FileSystemUtilTest
    
    Change-Id: Id807e8a200b83283a09d3a917185cabab930017d
    Reviewed-on: http://gerrit.cloudera.org:8080/17316
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../org/apache/impala/common/FileSystemUtil.java   | 16 ++++++---
 .../apache/impala/common/FileSystemUtilTest.java   | 40 +++++++++++++++++++++-
 2 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java b/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java
index 50e4ead..9393009 100644
--- a/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java
+++ b/fe/src/main/java/org/apache/impala/common/FileSystemUtil.java
@@ -263,7 +263,10 @@ public class FileSystemUtil {
             "Moving '%s' to '%s'", sourceFile.toString(), destFile.toString()));
       }
       // Move (rename) the file.
-      destFs.rename(sourceFile, destFile);
+      if (!destFs.rename(sourceFile, destFile)) {
+        throw new IOException(String.format(
+            "Failed to move '%s' to '%s'", sourceFile, destFile));
+      }
       return;
     }
     if (destIsDfs && sameFileSystem) {
@@ -579,13 +582,16 @@ public class FileSystemUtil {
    * Return true iff the path is on the given filesystem.
    */
   public static boolean isPathOnFileSystem(Path path, FileSystem fs) {
+    // Path 'path' must be qualified already.
+    Preconditions.checkState(
+        !path.equals(Path.getPathWithoutSchemeAndAuthority(path)),
+        String.format("Path '%s' is not qualified.", path));
     try {
-      // Call makeQualified() for the side-effect of FileSystem.checkPath() which will
-      // throw an exception if path is not on fs.
-      fs.makeQualified(path);
-      return true;
+      Path qp = fs.makeQualified(path);
+      return path.equals(qp);
     } catch (IllegalArgumentException e) {
       // Path is not on fs.
+      LOG.debug(String.format("Path '%s' is not on file system '%s'", path, fs));
       return false;
     }
   }
diff --git a/fe/src/test/java/org/apache/impala/common/FileSystemUtilTest.java b/fe/src/test/java/org/apache/impala/common/FileSystemUtilTest.java
index 2ed60f6..3105690 100644
--- a/fe/src/test/java/org/apache/impala/common/FileSystemUtilTest.java
+++ b/fe/src/test/java/org/apache/impala/common/FileSystemUtilTest.java
@@ -30,6 +30,9 @@ import org.junit.Test;
 import org.mockito.Mockito;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
 
 /**
  * Tests for the various util methods in FileSystemUtil class
@@ -163,6 +166,41 @@ public class FileSystemUtilTest {
     // TODO: enable following tests if we add them into impala mini cluster.
     // testValidLoadDataInpath(mockLocation(FileSystemUtil.SCHEME_O3FS), true);
     // testValidLoadDataInpath(mockLocation(FileSystemUtil.SCHEME_ALLUXIO), false);
+    // Also extend testIsPathOnFileSystem().
+  }
+
+  @Test
+  public void testIsPathOnFileSystem() throws IOException {
+    List<String> schemes = Arrays.asList(
+        FileSystemUtil.SCHEME_ABFS,
+        FileSystemUtil.SCHEME_ABFSS,
+        FileSystemUtil.SCHEME_ADL,
+        FileSystemUtil.SCHEME_HDFS,
+        FileSystemUtil.SCHEME_S3A,
+        FileSystemUtil.SCHEME_FILE);
+    List<Path> mockFiles = new ArrayList<>();
+    for (String scheme : schemes) {
+      mockFiles.add(new Path(mockLocation(scheme)));
+    }
+    List<FileSystem> fileSystems = new ArrayList<>();
+    for (Path mockFile : mockFiles) {
+      fileSystems.add(FileSystemUtil.getFileSystemForPath(mockFile));
+    }
+    for (int i = 0; i < fileSystems.size(); ++i) {
+      FileSystem fs = fileSystems.get(i);
+      for (int j = 0; j < mockFiles.size(); ++j) {
+        Path mockFile = mockFiles.get(j);
+        if (i == j) {
+          assertTrue(String.format(
+                          "Path '%s' should be on file system '%s'", mockFile, fs),
+                     FileSystemUtil.isPathOnFileSystem(mockFile, fs));
+        } else {
+          assertFalse(String.format(
+                          "Path '%s' should not be on file system '%s'", mockFile, fs),
+                      FileSystemUtil.isPathOnFileSystem(mockFile, fs));
+        }
+      }
+    }
   }
 
   private boolean testIsInIgnoredDirectory(Path input) {
@@ -185,7 +223,7 @@ public class FileSystemUtilTest {
       case FileSystemUtil.SCHEME_ADL:
         return "adl://dummy-account.azuredatalakestore.net/dummy-part-3";
       case FileSystemUtil.SCHEME_FILE:
-        return "file://tmp/dummy-part-4";
+        return "file:///tmp/dummy-part-4";
       case FileSystemUtil.SCHEME_HDFS:
         return "hdfs://localhost:20500/dummy-part-5";
       case FileSystemUtil.SCHEME_S3A: