You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2021/02/05 08:55:45 UTC
[impala] branch master updated: IMPALA-10459: Remove workarounds for MAPREDUCE-6441
This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new e71ea69 IMPALA-10459: Remove workarounds for MAPREDUCE-6441
e71ea69 is described below
commit e71ea69bb8e80efc5559309195c3b62b06015530
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Thu Jan 28 16:26:31 2021 +0800
IMPALA-10459: Remove workarounds for MAPREDUCE-6441
MAPREDUCE-6441 is resolved and is in our toolchain. This patch removes
workarounds for it.
Tests:
- Ran exhaustive test.
Change-Id: I5c4d482a6d15cdc08e9cf8878e130399665a8ee0
Reviewed-on: http://gerrit.cloudera.org:8080/17011
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
bin/load-data.py | 22 +------
.../catalog/events/EventsProcessorStressTest.java | 2 -
.../apache/impala/util/RandomHiveQueryRunner.java | 7 --
tests/common/impala_test_suite.py | 77 ++++++++--------------
4 files changed, 28 insertions(+), 80 deletions(-)
diff --git a/bin/load-data.py b/bin/load-data.py
index a7eb883..ef40730 100755
--- a/bin/load-data.py
+++ b/bin/load-data.py
@@ -26,11 +26,9 @@ import logging
import multiprocessing
import os
import re
-import shutil
import sqlparse
import subprocess
import sys
-import tempfile
import time
import traceback
@@ -148,27 +146,9 @@ def exec_hive_query_from_file_beeline(file_name):
LOG.info("Beginning execution of hive SQL: {0}".format(file_name))
- # When HiveServer2 is configured to use "local" mode (i.e., MR jobs are run
- # in-process rather than on YARN), Hadoop's LocalDistributedCacheManager has a
- # race, wherein it tires to localize jars into
- # /tmp/hadoop-$USER/mapred/local/<millis>. Two simultaneous Hive queries
- # against HS2 can conflict here. Weirdly LocalJobRunner handles a similar issue
- # (with the staging directory) by appending a random number. To over come this,
- # in the case that HS2 is on the local machine (which we conflate with also
- # running MR jobs locally), we move the temporary directory into a unique
- # directory via configuration. This block can be removed when
- # https://issues.apache.org/jira/browse/MAPREDUCE-6441 is resolved.
- hive_args = HIVE_ARGS
- unique_dir = None
- if options.hive_hs2_hostport.startswith("localhost:"):
- unique_dir = tempfile.mkdtemp(prefix="hive-data-load-")
- hive_args += ' --hiveconf "mapreduce.cluster.local.dir=%s"' % unique_dir
-
output_file = file_name + ".log"
- hive_cmd = "{0} {1} -f {2}".format(HIVE_CMD, hive_args, file_name)
+ hive_cmd = "{0} {1} -f {2}".format(HIVE_CMD, HIVE_ARGS, file_name)
is_success = exec_cmd(hive_cmd, exit_on_error=False, out_file=output_file)
- if unique_dir:
- shutil.rmtree(unique_dir)
if is_success:
LOG.info("Finished execution of hive SQL: {0}".format(file_name))
diff --git a/fe/src/test/java/org/apache/impala/catalog/events/EventsProcessorStressTest.java b/fe/src/test/java/org/apache/impala/catalog/events/EventsProcessorStressTest.java
index 19da6fb..a340ed1 100644
--- a/fe/src/test/java/org/apache/impala/catalog/events/EventsProcessorStressTest.java
+++ b/fe/src/test/java/org/apache/impala/catalog/events/EventsProcessorStressTest.java
@@ -36,7 +36,6 @@ import org.apache.thrift.TException;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
-import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -189,7 +188,6 @@ public class EventsProcessorStressTest {
}
}
- @Ignore("Ignored until MAPREDUCE-6441 is available in the toolchain")
@Test
public void testUsingRandomHiveQueries() throws Exception {
LOG.info("Using number of clients: {} number of queries per client: {}", numClients_,
diff --git a/fe/src/test/java/org/apache/impala/util/RandomHiveQueryRunner.java b/fe/src/test/java/org/apache/impala/util/RandomHiveQueryRunner.java
index 1e6f91c..2ffed70 100644
--- a/fe/src/test/java/org/apache/impala/util/RandomHiveQueryRunner.java
+++ b/fe/src/test/java/org/apache/impala/util/RandomHiveQueryRunner.java
@@ -76,9 +76,6 @@ public class RandomHiveQueryRunner {
private final List<QueryType> skippedQueryTypes;
- // we need to add small delay between the start of each query to work-around
- // MAPREDUCE-6441
- private final Object delayLock_ = new Object();
/**
* Query type with weight. The weight of a QueryType determines the probability of its
* occurrence by the Random Query runner. Higher the weight, more the probability of its
@@ -1104,10 +1101,6 @@ public class RandomHiveQueryRunner {
try {
LOG.info("Client {} running hive query set {}: {}", clientId, queryNumber,
query);
- // add a delay between the start of each query to work around MAPREDUCE-6441
- synchronized (delayLock_) {
- Thread.sleep(10);
- }
query.run(hiveJdbcClientPool_);
queryNumber++;
} catch (Exception e) {
diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py
index e5fcc70..dded47c 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -27,10 +27,8 @@ import pwd
import pytest
import re
import requests
-import shutil
import socket
import subprocess
-import tempfile
import time
from functools import wraps
from getpass import getuser
@@ -949,54 +947,33 @@ class ImpalaTestSuite(BaseTestSuite):
Run a statement in Hive, returning stdout if successful and throwing
RuntimeError(stderr) if not.
"""
- # When HiveServer2 is configured to use "local" mode (i.e., MR jobs are run
- # in-process rather than on YARN), Hadoop's LocalDistributedCacheManager has a
- # race, wherein it tires to localize jars into
- # /tmp/hadoop-$USER/mapred/local/<millis>. Two simultaneous Hive queries
- # against HS2 can conflict here. Weirdly LocalJobRunner handles a similar issue
- # (with the staging directory) by appending a random number. To overcome this,
- # in the case that HS2 is on the local machine (which we conflate with also
- # running MR jobs locally), we move the temporary directory into a unique
- # directory via configuration. This workaround can be removed when
- # https://issues.apache.org/jira/browse/MAPREDUCE-6441 is resolved.
- # A similar workaround is used in bin/load-data.py.
- tmpdir = None
- beeline_opts = []
- if pytest.config.option.hive_server2.startswith("localhost:"):
- tmpdir = tempfile.mkdtemp(prefix="impala-tests-")
- beeline_opts += ['--hiveconf', 'mapreduce.cluster.local.dir={0}'.format(tmpdir)]
- try:
- # Remove HADOOP_CLASSPATH from environment. Beeline doesn't need it,
- # and doing so avoids Hadoop 3's classpath de-duplication code from
- # placing $HADOOP_CONF_DIR too late in the classpath to get the right
- # log4j configuration file picked up. Some log4j configuration files
- # in Hadoop's jars send logging to stdout, confusing Impala's test
- # framework.
- env = os.environ.copy()
- env.pop("HADOOP_CLASSPATH", None)
- call = subprocess.Popen(
- ['beeline',
- '--outputformat=csv2',
- '-u', 'jdbc:hive2://' + pytest.config.option.hive_server2,
- '-n', username or getuser(),
- '-e', stmt] + beeline_opts,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- # Beeline in Hive 2.1 will read from stdin even when "-e"
- # is specified; explicitly make sure there's nothing to
- # read to avoid hanging, especially when running interactively
- # with py.test.
- stdin=file("/dev/null"),
- env=env)
- (stdout, stderr) = call.communicate()
- call.wait()
- if call.returncode != 0:
- raise RuntimeError(stderr)
- return stdout
- finally:
- # IMPALA-8315: removing the directory may race with Hive's own cleanup and cause
- # errors.
- if tmpdir is not None: shutil.rmtree(tmpdir, ignore_errors=True)
+ # Remove HADOOP_CLASSPATH from environment. Beeline doesn't need it,
+ # and doing so avoids Hadoop 3's classpath de-duplication code from
+ # placing $HADOOP_CONF_DIR too late in the classpath to get the right
+ # log4j configuration file picked up. Some log4j configuration files
+ # in Hadoop's jars send logging to stdout, confusing Impala's test
+ # framework.
+ env = os.environ.copy()
+ env.pop("HADOOP_CLASSPATH", None)
+ call = subprocess.Popen(
+ ['beeline',
+ '--outputformat=csv2',
+ '-u', 'jdbc:hive2://' + pytest.config.option.hive_server2,
+ '-n', username or getuser(),
+ '-e', stmt],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ # Beeline in Hive 2.1 will read from stdin even when "-e"
+ # is specified; explicitly make sure there's nothing to
+ # read to avoid hanging, especially when running interactively
+ # with py.test.
+ stdin=file("/dev/null"),
+ env=env)
+ (stdout, stderr) = call.communicate()
+ call.wait()
+ if call.returncode != 0:
+ raise RuntimeError(stderr)
+ return stdout
def hive_partition_names(self, table_name):
"""Find the names of the partitions of a table, as Hive sees them.