You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by ds...@apache.org on 2015/11/02 17:09:42 UTC
ambari git commit: AMBARI-13525 Exception in collector logs for JOIN
queries (dsen)
Repository: ambari
Updated Branches:
refs/heads/trunk 223247120 -> 646de5754
AMBARI-13525 Exception in collector logs for JOIN queries (dsen)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/646de575
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/646de575
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/646de575
Branch: refs/heads/trunk
Commit: 646de5754fd031270ed265cf7332a925ada24df2
Parents: 2232471
Author: Dmytro Sen <ds...@apache.org>
Authored: Mon Nov 2 18:08:47 2015 +0200
Committer: Dmytro Sen <ds...@apache.org>
Committed: Mon Nov 2 18:08:47 2015 +0200
----------------------------------------------------------------------
.../metrics/timeline/PhoenixHBaseAccessor.java | 22 ++++++++
.../timeline/query/PhoenixTransactSQL.java | 20 ++++++-
.../timeline/PhoenixHBaseAccessorTest.java | 58 ++++++++++++++++++--
.../timeline/TestPhoenixTransactSQL.java | 24 ++++++++
.../0.1.0/configuration/ams-hbase-site.xml | 22 +++++++-
.../stacks/HDP/2.0.6/services/stack_advisor.py | 4 ++
.../stacks/2.2/common/test_stack_advisor.py | 1 +
7 files changed, 145 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
----------------------------------------------------------------------
diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
index 214cf1d..3ce30fd 100644
--- a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
+++ b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
@@ -21,6 +21,7 @@ import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.util.RetryCounter;
import org.apache.hadoop.hbase.util.RetryCounterFactory;
import org.apache.hadoop.metrics2.sink.timeline.Precision;
@@ -39,6 +40,7 @@ import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.
import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.PhoenixTransactSQL;
import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.SplitByMetricNamesCondition;
import org.apache.hadoop.yarn.util.timeline.TimelineUtils;
+import org.apache.phoenix.exception.PhoenixIOException;
import org.apache.phoenix.exception.SQLExceptionCode;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.type.TypeReference;
@@ -463,6 +465,26 @@ public class PhoenixHBaseAccessor {
}
}
+ } catch (PhoenixIOException pioe) {
+ Throwable pioe2 = pioe.getCause();
+ // Need to find out if this is exception "Could not find hash cache
+ // for joinId" or another PhoenixIOException
+ if (pioe2 instanceof PhoenixIOException &&
+ pioe2.getCause() instanceof DoNotRetryIOException) {
+ String className = null;
+ for (StackTraceElement ste : pioe2.getCause().getStackTrace()) {
+ className = ste.getClassName();
+ }
+
+ if (className != null && className.equals("HashJoinRegionScanner")) {
+ LOG.error("The cache might have expired and have been removed. Try to" +
+ " increase the cache size by setting bigger value for " +
+ "phoenix.coprocessor.maxMetaDataCacheSize in ams-hbase-site config." +
+ " Falling back to sort-merge join algorithm.");
+ PhoenixTransactSQL.setSortMergeJoinEnabled(true);
+ }
+ }
+ throw pioe;
} catch (RuntimeException ex) {
// We need to find out if this is a real IO exception
// or exception "maxStamp is smaller than minStamp"
http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java
----------------------------------------------------------------------
diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java
index 1ab92a0..092c983 100644
--- a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java
+++ b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java
@@ -165,7 +165,7 @@ public class PhoenixTransactSQL {
* Different queries for a number and a single hosts are used due to bug
* in Apache Phoenix
*/
- public static final String GET_LATEST_METRIC_SQL = "SELECT " +
+ public static final String GET_LATEST_METRIC_SQL = "SELECT %s " +
"E.METRIC_NAME AS METRIC_NAME, E.HOSTNAME AS HOSTNAME, " +
"E.APP_ID AS APP_ID, E.INSTANCE_ID AS INSTANCE_ID, " +
"E.SERVER_TIME AS SERVER_TIME, E.START_TIME AS START_TIME, " +
@@ -257,6 +257,7 @@ public class PhoenixTransactSQL {
public static final long NATIVE_TIME_RANGE_DELTA = 120000; // 2 minutes
public static final long HOUR = 3600000; // 1 hour
public static final long DAY = 86400000; // 1 day
+ private static boolean sortMergeJoinEnabled = false;
/**
* Filter to optimize HBase scan by using file timestamps. This prevents
@@ -268,6 +269,22 @@ public class PhoenixTransactSQL {
return String.format("/*+ NATIVE_TIME_RANGE(%s) */", (startTime - delta));
}
+ /**
+  * Supplies the Phoenix hint comment that is substituted into the leading
+  * placeholder of GET_LATEST_METRIC_SQL.
+  *
+  * @return the sort merge join hint when the fallback has been switched on,
+  * otherwise an empty string
+  */
+ public static String getLatestMetricsHints() {
+   return sortMergeJoinEnabled ? "/*+ USE_SORT_MERGE_JOIN NO_CACHE */" : "";
+ }
+
+ /**
+  * Switches the sort merge join fallback for latest-metrics queries on or off.
+  *
+  * @param sortMergeJoinEnabled {@code true} to make
+  * {@link #getLatestMetricsHints()} return the sort merge join hint,
+  * {@code false} to make it return an empty string
+  */
+ public static void setSortMergeJoinEnabled(boolean sortMergeJoinEnabled) {
+   PhoenixTransactSQL.sortMergeJoinEnabled = sortMergeJoinEnabled;
+ }
+
public static PreparedStatement prepareGetMetricsSqlStmt(Connection connection,
Condition condition) throws SQLException {
@@ -448,6 +465,7 @@ public class PhoenixTransactSQL {
stmtStr = condition.getStatement();
} else {
stmtStr = String.format(GET_LATEST_METRIC_SQL,
+ getLatestMetricsHints(),
METRICS_RECORD_TABLE_NAME,
METRICS_RECORD_TABLE_NAME,
condition.getConditionClause());
http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java
----------------------------------------------------------------------
diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java b/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java
index 2d48a58..0a6f120 100644
--- a/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java
+++ b/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java
@@ -18,6 +18,7 @@
package org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.metrics2.sink.timeline.Precision;
import org.apache.hadoop.metrics2.sink.timeline.TimelineMetrics;
import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.aggregators.Function;
@@ -25,6 +26,7 @@ import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.
import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.ConnectionProvider;
import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.DefaultCondition;
import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.PhoenixTransactSQL;
+import org.apache.phoenix.exception.PhoenixIOException;
import org.easymock.EasyMock;
import org.junit.Test;
import org.junit.runner.RunWith;
@@ -43,10 +45,8 @@ import java.util.List;
import java.util.Map;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
-/**
- * Created by user on 9/7/15.
- */
@RunWith(PowerMockRunner.class)
@PrepareForTest(PhoenixTransactSQL.class)
public class PhoenixHBaseAccessorTest {
@@ -96,7 +96,8 @@ public class PhoenixHBaseAccessorTest {
}
@Test
- public void testGetMetricRecordsException() throws SQLException, IOException {
+ public void testGetMetricRecordsIOException()
+ throws SQLException, IOException {
Configuration hbaseConf = new Configuration();
hbaseConf.setStrings(ZOOKEEPER_QUORUM, "quorum");
@@ -139,4 +140,53 @@ public class PhoenixHBaseAccessorTest {
EasyMock.verify(preparedStatementMock, rsMock, io, runtimeException);
}
+ @Test
+ public void testGetMetricRecordsPhoenixIOExceptionDoNotRetryException()
+     throws SQLException, IOException {
+   // Scenario: executeQuery() throws a PhoenixIOException whose cause chain is
+   // PhoenixIOException -> DoNotRetryIOException, and the innermost stack
+   // trace names HashJoinRegionScanner. The accessor is then expected to call
+   // PhoenixTransactSQL.setSortMergeJoinEnabled(true) and propagate an
+   // exception to the caller.
+
+   Configuration hbaseConf = new Configuration();
+   hbaseConf.setStrings(ZOOKEEPER_QUORUM, "quorum");
+   Configuration metricsConf = new Configuration();
+
+   // The connection itself is never used: the statement factory is mocked.
+   ConnectionProvider connectionProvider = new ConnectionProvider() {
+     @Override
+     public Connection getConnection() throws SQLException {
+       return null;
+     }
+   };
+
+   PhoenixHBaseAccessor accessor = new PhoenixHBaseAccessor(hbaseConf, metricsConf, connectionProvider);
+
+   List<String> metricNames = new LinkedList<>();
+   List<String> hostnames = new LinkedList<>();
+   Map<String, List<Function>> metricFunctions = new HashMap<>();
+
+   // Static expectations: the statement is handed out once and the fallback
+   // switch must be invoked with true.
+   PowerMock.mockStatic(PhoenixTransactSQL.class);
+   PreparedStatement preparedStatementMock = EasyMock.createNiceMock(PreparedStatement.class);
+   Condition condition = new DefaultCondition(metricNames, hostnames, "appid", "instanceid", null, null, Precision.SECONDS, 10, true);
+   EasyMock.expect(PhoenixTransactSQL.prepareGetLatestMetricSqlStmt(null, condition)).andReturn(preparedStatementMock).once();
+   PhoenixTransactSQL.setSortMergeJoinEnabled(true);
+   EasyMock.expectLastCall();
+
+   // Exception chain returned by the mocked statement.
+   PhoenixIOException pioe1 = EasyMock.createNiceMock(PhoenixIOException.class);
+   PhoenixIOException pioe2 = EasyMock.createNiceMock(PhoenixIOException.class);
+   DoNotRetryIOException dnrioe = EasyMock.createNiceMock(DoNotRetryIOException.class);
+   EasyMock.expect(preparedStatementMock.executeQuery()).andThrow(pioe1);
+   EasyMock.expect(pioe1.getCause()).andReturn(pioe2).atLeastOnce();
+   EasyMock.expect(pioe2.getCause()).andReturn(dnrioe).atLeastOnce();
+   StackTraceElement[] stackTrace = new StackTraceElement[]{new StackTraceElement("HashJoinRegionScanner","method","file",1)};
+   EasyMock.expect(dnrioe.getStackTrace()).andReturn(stackTrace).atLeastOnce();
+
+   PowerMock.replayAll();
+   EasyMock.replay(preparedStatementMock, pioe1, pioe2, dnrioe);
+   try {
+     accessor.getMetricRecords(condition, metricFunctions);
+     fail();
+   } catch (Exception expected) {
+     // Intentionally ignored: the call is required to fail. fail() raises an
+     // AssertionError, which is not an Exception and so is not swallowed here.
+   }
+   // Confirms the static expectations, including setSortMergeJoinEnabled(true).
+   PowerMock.verifyAll();
+ }
+
}
http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java
----------------------------------------------------------------------
diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java b/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java
index ec6a472..6bf15c7 100644
--- a/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java
+++ b/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java
@@ -382,6 +382,30 @@ public class TestPhoenixTransactSQL {
}
@Test
+ public void testPrepareGetLatestMetricSqlStmtSortMergeJoinAlgorithm()
+     throws SQLException {
+   // Checks that the latest-metrics statement carries the sort merge join
+   // hint once the fallback flag has been switched on.
+   Condition condition = new DefaultCondition(
+     Arrays.asList("cpu_user", "mem_free"), Arrays.asList("h1"),
+     "a1", "i1", null, null, null, null, false);
+   Connection connection = createNiceMock(Connection.class);
+   PreparedStatement preparedStatement = createNiceMock(PreparedStatement.class);
+   ParameterMetaData parameterMetaData = createNiceMock(ParameterMetaData.class);
+   // Captures the SQL text handed to Connection.prepareStatement().
+   Capture<String> stmtCapture = new Capture<String>();
+   expect(connection.prepareStatement(EasyMock.and(EasyMock.anyString(), EasyMock.capture(stmtCapture))))
+     .andReturn(preparedStatement);
+   expect(preparedStatement.getParameterMetaData())
+     .andReturn(parameterMetaData).anyTimes();
+   expect(parameterMetaData.getParameterCount())
+     .andReturn(6).anyTimes();
+
+   replay(connection, preparedStatement, parameterMetaData);
+   PhoenixTransactSQL.setSortMergeJoinEnabled(true);
+   try {
+     PhoenixTransactSQL.prepareGetLatestMetricSqlStmt(connection, condition);
+     String stmt = stmtCapture.getValue();
+     Assert.assertTrue(stmt.contains("/*+ USE_SORT_MERGE_JOIN NO_CACHE */"));
+   } finally {
+     // The flag is static state on PhoenixTransactSQL; restore its initial
+     // value so tests that run afterwards do not see the hint.
+     PhoenixTransactSQL.setSortMergeJoinEnabled(false);
+   }
+ }
+
+ @Test
public void testPrepareGetMetricsPrecisionHours() throws SQLException {
Condition condition = new DefaultCondition(
Arrays.asList("cpu_user", "mem_free"), Collections.singletonList("h1"),
http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml
index dfc2878..3f4a9d4 100644
--- a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml
+++ b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml
@@ -333,7 +333,27 @@
a normal table and would return items in rowkey order for scans
</description>
</property>
-
+ <property>
+ <name>phoenix.coprocessor.maxServerCacheTimeToLiveMs</name>
+ <value>60000</value>
+ <description>
+ Maximum living time (in milliseconds) of server caches. A cache entry
+ expires after this amount of time has passed since last access. Consider
+ adjusting this parameter when a server-side IOException(
+ “Could not find hash cache for joinId”) happens. Getting warnings like
+ “Earlier hash cache(s) might have expired on servers” might also be a
+ sign that this number should be increased.
+ </description>
+ </property>
+ <property>
+ <name>phoenix.coprocessor.maxMetaDataCacheSize</name>
+ <value>20480000</value>
+ <description>
+ Max size in bytes of total server-side metadata cache after which
+ evictions will begin to occur based on least recent access time.
+ Default is 20 MB.
+ </description>
+ </property>
<property>
<name>dfs.client.read.shortcircuit</name>
<value>true</value>
http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
index 68522bf..7181122 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
@@ -480,11 +480,15 @@ class HDP206StackAdvisor(DefaultStackAdvisor):
putAmsHbaseSiteProperty("hbase.regionserver.global.memstore.lowerLimit", 0.25)
putAmsHbaseSiteProperty("phoenix.query.maxGlobalMemoryPercentage", 20)
putTimelineServiceProperty("phoenix.query.maxGlobalMemoryPercentage", 30)
+ putAmsHbaseSiteProperty("phoenix.coprocessor.maxMetaDataCacheSize", 81920000)
elif total_sinks_count >= 500:
putAmsHbaseSiteProperty("hbase.regionserver.handler.count", 60)
putAmsHbaseSiteProperty("hbase.regionserver.hlog.blocksize", 134217728)
putAmsHbaseSiteProperty("hbase.regionserver.maxlogs", 64)
putAmsHbaseSiteProperty("hbase.hregion.memstore.flush.size", 268435456)
+ putAmsHbaseSiteProperty("phoenix.coprocessor.maxMetaDataCacheSize", 40960000)
+ else:
+ putAmsHbaseSiteProperty("phoenix.coprocessor.maxMetaDataCacheSize", 20480000)
pass
# Distributed mode heap size
http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py b/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
index b38af78..a5a588f 100644
--- a/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
+++ b/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
@@ -1999,6 +1999,7 @@ class TestHDP22StackAdvisor(TestCase):
},
"ams-hbase-site": {
"properties": {
+ "phoenix.coprocessor.maxMetaDataCacheSize": "20480000",
"hbase.regionserver.global.memstore.lowerLimit": "0.3",
"hbase.regionserver.global.memstore.upperLimit": "0.35",
"hbase.hregion.memstore.flush.size": "134217728",