You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2021/02/05 08:55:45 UTC
[impala] branch master updated: IMPALA-10459: Remove workarounds for MAPREDUCE-6441

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new e71ea69  IMPALA-10459: Remove workarounds for MAPREDUCE-6441
e71ea69 is described below

commit e71ea69bb8e80efc5559309195c3b62b06015530
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Thu Jan 28 16:26:31 2021 +0800

    IMPALA-10459: Remove workarounds for MAPREDUCE-6441
    
    The fix for MAPREDUCE-6441 has been released and is now included in our
    toolchain. This patch removes the workarounds that were added for it.
    
    Tests:
     - Ran exhaustive tests.
    
    Change-Id: I5c4d482a6d15cdc08e9cf8878e130399665a8ee0
    Reviewed-on: http://gerrit.cloudera.org:8080/17011
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 bin/load-data.py                                   | 22 +------
 .../catalog/events/EventsProcessorStressTest.java  |  2 -
 .../apache/impala/util/RandomHiveQueryRunner.java  |  7 --
 tests/common/impala_test_suite.py                  | 77 ++++++++--------------
 4 files changed, 28 insertions(+), 80 deletions(-)

diff --git a/bin/load-data.py b/bin/load-data.py
index a7eb883..ef40730 100755
--- a/bin/load-data.py
+++ b/bin/load-data.py
@@ -26,11 +26,9 @@ import logging
 import multiprocessing
 import os
 import re
-import shutil
 import sqlparse
 import subprocess
 import sys
-import tempfile
 import time
 import traceback
 
@@ -148,27 +146,9 @@ def exec_hive_query_from_file_beeline(file_name):
 
   LOG.info("Beginning execution of hive SQL: {0}".format(file_name))
 
-  # When HiveServer2 is configured to use "local" mode (i.e., MR jobs are run
-  # in-process rather than on YARN), Hadoop's LocalDistributedCacheManager has a
-  # race, wherein it tires to localize jars into
-  # /tmp/hadoop-$USER/mapred/local/<millis>. Two simultaneous Hive queries
-  # against HS2 can conflict here. Weirdly LocalJobRunner handles a similar issue
-  # (with the staging directory) by appending a random number. To over come this,
-  # in the case that HS2 is on the local machine (which we conflate with also
-  # running MR jobs locally), we move the temporary directory into a unique
-  # directory via configuration. This block can be removed when
-  # https://issues.apache.org/jira/browse/MAPREDUCE-6441 is resolved.
-  hive_args = HIVE_ARGS
-  unique_dir = None
-  if options.hive_hs2_hostport.startswith("localhost:"):
-    unique_dir = tempfile.mkdtemp(prefix="hive-data-load-")
-    hive_args += ' --hiveconf "mapreduce.cluster.local.dir=%s"' % unique_dir
-
   output_file = file_name + ".log"
-  hive_cmd = "{0} {1} -f {2}".format(HIVE_CMD, hive_args, file_name)
+  hive_cmd = "{0} {1} -f {2}".format(HIVE_CMD, HIVE_ARGS, file_name)
   is_success = exec_cmd(hive_cmd, exit_on_error=False, out_file=output_file)
-  if unique_dir:
-    shutil.rmtree(unique_dir)
 
   if is_success:
     LOG.info("Finished execution of hive SQL: {0}".format(file_name))
diff --git a/fe/src/test/java/org/apache/impala/catalog/events/EventsProcessorStressTest.java b/fe/src/test/java/org/apache/impala/catalog/events/EventsProcessorStressTest.java
index 19da6fb..a340ed1 100644
--- a/fe/src/test/java/org/apache/impala/catalog/events/EventsProcessorStressTest.java
+++ b/fe/src/test/java/org/apache/impala/catalog/events/EventsProcessorStressTest.java
@@ -36,7 +36,6 @@ import org.apache.thrift.TException;
 import org.junit.AfterClass;
 import org.junit.Assert;
 import org.junit.BeforeClass;
-import org.junit.Ignore;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -189,7 +188,6 @@ public class EventsProcessorStressTest {
     }
   }
 
-  @Ignore("Ignored until MAPREDUCE-6441 is available in the toolchain")
   @Test
   public void testUsingRandomHiveQueries() throws Exception {
     LOG.info("Using number of clients: {} number of queries per client: {}", numClients_,
diff --git a/fe/src/test/java/org/apache/impala/util/RandomHiveQueryRunner.java b/fe/src/test/java/org/apache/impala/util/RandomHiveQueryRunner.java
index 1e6f91c..2ffed70 100644
--- a/fe/src/test/java/org/apache/impala/util/RandomHiveQueryRunner.java
+++ b/fe/src/test/java/org/apache/impala/util/RandomHiveQueryRunner.java
@@ -76,9 +76,6 @@ public class RandomHiveQueryRunner {
 
   private final List<QueryType> skippedQueryTypes;
 
-  // we need to add small delay between the start of each query to work-around
-  // MAPREDUCE-6441
-  private final Object delayLock_ = new Object();
   /**
    * Query type with weight. The weight of a QueryType determines the probability of its
    * occurrence by the Random Query runner. Higher the weight, more the probability of its
@@ -1104,10 +1101,6 @@ public class RandomHiveQueryRunner {
           try {
             LOG.info("Client {} running hive query set {}: {}", clientId, queryNumber,
                 query);
-            // add a delay between the start of each query to work around MAPREDUCE-6441
-            synchronized (delayLock_) {
-              Thread.sleep(10);
-            }
             query.run(hiveJdbcClientPool_);
             queryNumber++;
           } catch (Exception e) {
diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py
index e5fcc70..dded47c 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -27,10 +27,8 @@ import pwd
 import pytest
 import re
 import requests
-import shutil
 import socket
 import subprocess
-import tempfile
 import time
 from functools import wraps
 from getpass import getuser
@@ -949,54 +947,33 @@ class ImpalaTestSuite(BaseTestSuite):
     Run a statement in Hive, returning stdout if successful and throwing
     RuntimeError(stderr) if not.
     """
-    # When HiveServer2 is configured to use "local" mode (i.e., MR jobs are run
-    # in-process rather than on YARN), Hadoop's LocalDistributedCacheManager has a
-    # race, wherein it tires to localize jars into
-    # /tmp/hadoop-$USER/mapred/local/<millis>. Two simultaneous Hive queries
-    # against HS2 can conflict here. Weirdly LocalJobRunner handles a similar issue
-    # (with the staging directory) by appending a random number. To overcome this,
-    # in the case that HS2 is on the local machine (which we conflate with also
-    # running MR jobs locally), we move the temporary directory into a unique
-    # directory via configuration. This workaround can be removed when
-    # https://issues.apache.org/jira/browse/MAPREDUCE-6441 is resolved.
-    # A similar workaround is used in bin/load-data.py.
-    tmpdir = None
-    beeline_opts = []
-    if pytest.config.option.hive_server2.startswith("localhost:"):
-      tmpdir = tempfile.mkdtemp(prefix="impala-tests-")
-      beeline_opts += ['--hiveconf', 'mapreduce.cluster.local.dir={0}'.format(tmpdir)]
-    try:
-      # Remove HADOOP_CLASSPATH from environment. Beeline doesn't need it,
-      # and doing so avoids Hadoop 3's classpath de-duplication code from
-      # placing $HADOOP_CONF_DIR too late in the classpath to get the right
-      # log4j configuration file picked up. Some log4j configuration files
-      # in Hadoop's jars send logging to stdout, confusing Impala's test
-      # framework.
-      env = os.environ.copy()
-      env.pop("HADOOP_CLASSPATH", None)
-      call = subprocess.Popen(
-          ['beeline',
-           '--outputformat=csv2',
-           '-u', 'jdbc:hive2://' + pytest.config.option.hive_server2,
-           '-n', username or getuser(),
-           '-e', stmt] + beeline_opts,
-          stdout=subprocess.PIPE,
-          stderr=subprocess.PIPE,
-          # Beeline in Hive 2.1 will read from stdin even when "-e"
-          # is specified; explicitly make sure there's nothing to
-          # read to avoid hanging, especially when running interactively
-          # with py.test.
-          stdin=file("/dev/null"),
-          env=env)
-      (stdout, stderr) = call.communicate()
-      call.wait()
-      if call.returncode != 0:
-        raise RuntimeError(stderr)
-      return stdout
-    finally:
-      # IMPALA-8315: removing the directory may race with Hive's own cleanup and cause
-      # errors.
-      if tmpdir is not None: shutil.rmtree(tmpdir, ignore_errors=True)
+    # Remove HADOOP_CLASSPATH from environment. Beeline doesn't need it,
+    # and doing so avoids Hadoop 3's classpath de-duplication code from
+    # placing $HADOOP_CONF_DIR too late in the classpath to get the right
+    # log4j configuration file picked up. Some log4j configuration files
+    # in Hadoop's jars send logging to stdout, confusing Impala's test
+    # framework.
+    env = os.environ.copy()
+    env.pop("HADOOP_CLASSPATH", None)
+    call = subprocess.Popen(
+        ['beeline',
+         '--outputformat=csv2',
+         '-u', 'jdbc:hive2://' + pytest.config.option.hive_server2,
+         '-n', username or getuser(),
+         '-e', stmt],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        # Beeline in Hive 2.1 will read from stdin even when "-e"
+        # is specified; explicitly make sure there's nothing to
+        # read to avoid hanging, especially when running interactively
+        # with py.test.
+        stdin=file("/dev/null"),
+        env=env)
+    (stdout, stderr) = call.communicate()
+    call.wait()
+    if call.returncode != 0:
+      raise RuntimeError(stderr)
+    return stdout
 
   def hive_partition_names(self, table_name):
     """Find the names of the partitions of a table, as Hive sees them.