You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2020/06/04 18:00:58 UTC

[impala] branch master updated (37b5599 -> 3713d5d)

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.


    from 37b5599  IMPALA-9809: Multi-aggregation query on particular dataset crashes impalad
     new 3c71586  IMPALA-9723: Raise error when Hive Streaming side-file is found
     new 3713d5d  IMPALA-9820: Pull Datasketches-5 HLL MurmurHash fix

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 be/src/thirdparty/datasketches/MurmurHash3.h       | 11 +++------
 .../java/org/apache/impala/util/AcidUtils.java     |  9 +++++++-
 .../java/org/apache/impala/util/AcidUtilsTest.java | 27 ++++++++++++++++++++++
 3 files changed, 38 insertions(+), 9 deletions(-)


[impala] 01/02: IMPALA-9723: Raise error when Hive Streaming side-file is found

Posted by jo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 3c715864674004011dd87e01187f6ed378506a91
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Fri May 29 17:07:03 2020 +0200

    IMPALA-9723: Raise error when Hive Streaming side-file is found
    
    Currently Impala cannot read a Hive Streaming file when it is being
    appended, i.e. when a side-file tells the last committed file size.
    
    With this commit Impala raises an error during table loading whenever
    it encounters a side-file.
    
    Testing:
     * added new unit test to AcidUtilsTest
    
    Change-Id: I8223411570ec5e31bbb98b907cf0e5c235817760
    Reviewed-on: http://gerrit.cloudera.org:8080/16002
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../java/org/apache/impala/util/AcidUtils.java     |  9 +++++++-
 .../java/org/apache/impala/util/AcidUtilsTest.java | 27 ++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/fe/src/main/java/org/apache/impala/util/AcidUtils.java b/fe/src/main/java/org/apache/impala/util/AcidUtils.java
index e31bac7..b3d637a 100644
--- a/fe/src/main/java/org/apache/impala/util/AcidUtils.java
+++ b/fe/src/main/java/org/apache/impala/util/AcidUtils.java
@@ -335,7 +335,7 @@ public class AcidUtils {
 
   private static List<FileStatus> filterFilesForAcidState(List<FileStatus> stats,
       Path baseDir, long maxBaseWriteId, Set<String> deltaDirs,
-      @Nullable LoadStats loadStats) {
+      @Nullable LoadStats loadStats) throws MetaException {
     List<FileStatus> validStats = new ArrayList<>(stats);
     for (Iterator<FileStatus> it = validStats.iterator(); it.hasNext();) {
       FileStatus stat = it.next();
@@ -353,6 +353,13 @@ public class AcidUtils {
           it.remove();
           if (loadStats != null) loadStats.filesSupersededByAcidState++;
         }
+        if (relPath.endsWith("_flush_length")) {
+          throw new MetaException("Found Hive Streaming side-file: " +
+              stat.getPath() + " It means that the contents of the directory are " +
+              "currently being written, therefore Impala is not able to read it. " +
+              "Please try to load the table again once Hive Streaming commits " +
+              "the transaction.");
+        }
         continue;
       }
       long baseWriteId = getBaseWriteId(relPath);
diff --git a/fe/src/test/java/org/apache/impala/util/AcidUtilsTest.java b/fe/src/test/java/org/apache/impala/util/AcidUtilsTest.java
index 6598687..d75ee64 100644
--- a/fe/src/test/java/org/apache/impala/util/AcidUtilsTest.java
+++ b/fe/src/test/java/org/apache/impala/util/AcidUtilsTest.java
@@ -507,6 +507,33 @@ public class AcidUtilsTest {
         );
   }
 
+  public void testHiveStreamingFail() {
+    filteringError(new String[]{
+            "base_0000005/",
+            "base_0000005/abc.txt",
+            "delta_0000006_0000016/",
+            "delta_0000006_0000016/00000_0",
+            "delta_0000006_0000016/00000_0_flush_length"},
+        // all txns are valid
+        "",
+        // <tbl>:<hwm>:<minOpenWriteId>:<openWriteIds>:<abortedWriteIds>
+        "default.test:22:1234:1,2,3",
+        "Found Hive Streaming side-file"
+        );
+    assertFiltering(new String[]{
+            "base_0000005/",
+            "base_0000005/abc.txt",
+            "delta_0000006_0000016/",
+            "delta_0000006_0000016/00000_0",
+            "delta_0000006_0000016/00000_0_flush_length",
+            "base_0000017_v123/0000_0"},
+        // all txns are valid
+        "",
+        // <tbl>:<hwm>:<minOpenWriteId>:<openWriteIds>:<abortedWriteIds>
+        "default.test:22:1234:1,2,3",
+        new String[]{"base_0000017_v123/0000_0"});
+  }
+
   @Test
   public void testMinorCompactionBeforeBase() {
     assertFiltering(new String[]{


[impala] 02/02: IMPALA-9820: Pull Datasketches-5 HLL MurmurHash fix

Posted by jo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 3713d5db8dcac540ce0b5cb45974054ca87792db
Author: Gabor Kaszab <ga...@cloudera.com>
AuthorDate: Tue Jun 2 22:08:38 2020 +0200

    IMPALA-9820: Pull Datasketches-5 HLL MurmurHash fix
    
    There is a bug in DataSketches HLL MurmurHash where long strings are
    over-read, resulting in a cardinality estimate that is more than 15% off
    from the correct cardinality number. A recent upstream fix in Apache
    DataSketches addresses this issue, and this patch pulls it into Impala.
    
    https://issues.apache.org/jira/browse/DATASKETCHES-5
    
    Testing:
      - I used ds_hll_sketch() and ds_hll_estimate() functions from
        IMPALA-9632 to trigger DataSketches HLL functionality.
      - Ran DataSketches HLL on lineitem.l_comment in TPCH25_parquet to
        reproduce the issue. The symptom was that the actual result was
        around 15% off from the correct cardinality result (~69M vs 79M).
      - After applying this fix re-running the query gives much closer
        results, usually under 3% error range.
    
    Change-Id: I84d73fce1e7a197c1f8fb49404b58ed9bb0b843d
    Reviewed-on: http://gerrit.cloudera.org:8080/16026
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/thirdparty/datasketches/MurmurHash3.h | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/be/src/thirdparty/datasketches/MurmurHash3.h b/be/src/thirdparty/datasketches/MurmurHash3.h
index 45a64c6..f68e989 100644
--- a/be/src/thirdparty/datasketches/MurmurHash3.h
+++ b/be/src/thirdparty/datasketches/MurmurHash3.h
@@ -104,14 +104,12 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, int lenBytes, uint64_t se
   out.h2 = seed;
 
   // Number of full 128-bit blocks of 16 bytes.
-  // Possible exclusion fo a remainder of up to 15 bytes.
+  // Possible exclusion of a remainder of up to 15 bytes.
   const int nblocks = lenBytes >> 4; // bytes / 16 
 
-  // Process the 128-bit blocks (the body) into teh hash
+  // Process the 128-bit blocks (the body) into the hash
   const uint64_t* blocks = (const uint64_t*)(data);
   for (int i = 0; i < nblocks; ++i) { // 16 bytes per block
-    //uint64_t k1 = getblock64(blocks, 0);
-    //uint64_t k2 = getblock64(blocks, 1);
     uint64_t k1 = getblock64(blocks,i*2+0);
     uint64_t k2 = getblock64(blocks,i*2+1);
 
@@ -124,12 +122,9 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, int lenBytes, uint64_t se
     out.h2 = ROTL64(out.h2,31);
     out.h2 += out.h1;
     out.h2 = out.h2*5+0x38495ab5;
-
-    blocks += 2;
   }
 
   // tail
-  //const uint8_t * tail = (const uint8_t*)blocks;
   const uint8_t * tail = (const uint8_t*)(data + (nblocks << 4));
 
   uint64_t k1 = 0;
@@ -175,4 +170,4 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, int lenBytes, uint64_t se
 
 //-----------------------------------------------------------------------------
 
-#endif // _MURMURHASH3_H_
\ No newline at end of file
+#endif // _MURMURHASH3_H_