You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by ds...@apache.org on 2015/11/02 17:09:42 UTC

ambari git commit: AMBARI-13525 Exception in collector logs for JOIN queries (dsen)

Repository: ambari
Updated Branches:
  refs/heads/trunk 223247120 -> 646de5754


AMBARI-13525 Exception in collector logs for JOIN queries (dsen)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/646de575
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/646de575
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/646de575

Branch: refs/heads/trunk
Commit: 646de5754fd031270ed265cf7332a925ada24df2
Parents: 2232471
Author: Dmytro Sen <ds...@apache.org>
Authored: Mon Nov 2 18:08:47 2015 +0200
Committer: Dmytro Sen <ds...@apache.org>
Committed: Mon Nov 2 18:08:47 2015 +0200

----------------------------------------------------------------------
 .../metrics/timeline/PhoenixHBaseAccessor.java  | 22 ++++++++
 .../timeline/query/PhoenixTransactSQL.java      | 20 ++++++-
 .../timeline/PhoenixHBaseAccessorTest.java      | 58 ++++++++++++++++++--
 .../timeline/TestPhoenixTransactSQL.java        | 24 ++++++++
 .../0.1.0/configuration/ams-hbase-site.xml      | 22 +++++++-
 .../stacks/HDP/2.0.6/services/stack_advisor.py  |  4 ++
 .../stacks/2.2/common/test_stack_advisor.py     |  1 +
 7 files changed, 145 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
----------------------------------------------------------------------
diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
index 214cf1d..3ce30fd 100644
--- a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
+++ b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
@@ -21,6 +21,7 @@ import org.apache.commons.lang.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.DoNotRetryIOException;
 import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.hadoop.hbase.util.RetryCounterFactory;
 import org.apache.hadoop.metrics2.sink.timeline.Precision;
@@ -39,6 +40,7 @@ import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.
 import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.PhoenixTransactSQL;
 import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.SplitByMetricNamesCondition;
 import org.apache.hadoop.yarn.util.timeline.TimelineUtils;
+import org.apache.phoenix.exception.PhoenixIOException;
 import org.apache.phoenix.exception.SQLExceptionCode;
 import org.codehaus.jackson.map.ObjectMapper;
 import org.codehaus.jackson.type.TypeReference;
@@ -463,6 +465,26 @@ public class PhoenixHBaseAccessor {
         }
       }
 
+    } catch (PhoenixIOException pioe) {
+      Throwable pioe2 = pioe.getCause();
+      // Need to find out if this is exception "Could not find hash cache
+      // for joinId" or another PhoenixIOException
+      if (pioe2 instanceof PhoenixIOException &&
+        pioe2.getCause() instanceof DoNotRetryIOException) {
+        String className = null;
+        for (StackTraceElement ste : pioe2.getCause().getStackTrace()) {
+          className = ste.getClassName();
+        }
+
+        if (className != null && className.equals("HashJoinRegionScanner")) {
+          LOG.error("The cache might have expired and have been removed. Try to" +
+            " increase the cache size by setting bigger value for " +
+            "phoenix.coprocessor.maxMetaDataCacheSize in ams-hbase-site config." +
+            " Falling back to sort-merge join algorithm.");
+          PhoenixTransactSQL.setSortMergeJoinEnabled(true);
+        }
+      }
+      throw pioe;
     } catch (RuntimeException ex) {
       // We need to find out if this is a real IO exception
       // or exception "maxStamp is smaller than minStamp"

http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java
----------------------------------------------------------------------
diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java
index 1ab92a0..092c983 100644
--- a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java
+++ b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/query/PhoenixTransactSQL.java
@@ -165,7 +165,7 @@ public class PhoenixTransactSQL {
    * Different queries for a number and a single hosts are used due to bug
    * in Apache Phoenix
    */
-  public static final String GET_LATEST_METRIC_SQL = "SELECT " +
+  public static final String GET_LATEST_METRIC_SQL = "SELECT %s " +
     "E.METRIC_NAME AS METRIC_NAME, E.HOSTNAME AS HOSTNAME, " +
     "E.APP_ID AS APP_ID, E.INSTANCE_ID AS INSTANCE_ID, " +
     "E.SERVER_TIME AS SERVER_TIME, E.START_TIME AS START_TIME, " +
@@ -257,6 +257,7 @@ public class PhoenixTransactSQL {
   public static final long NATIVE_TIME_RANGE_DELTA = 120000; // 2 minutes
   public static final long HOUR = 3600000; // 1 hour
   public static final long DAY = 86400000; // 1 day
+  private static boolean sortMergeJoinEnabled = false;
 
   /**
    * Filter to optimize HBase scan by using file timestamps. This prevents
@@ -268,6 +269,22 @@ public class PhoenixTransactSQL {
     return String.format("/*+ NATIVE_TIME_RANGE(%s) */", (startTime - delta));
   }
 
+  /**
+   * Builds the Phoenix hint prefix for latest-metric queries. The sort-merge
+   * join hint is emitted only after the fallback flag has been switched on.
+   *
+   * @return Phoenix hint string, or an empty string when no hint applies
+   */
+  public static String getLatestMetricsHints() {
+    return sortMergeJoinEnabled
+      ? "/*+ USE_SORT_MERGE_JOIN NO_CACHE */"
+      : "";
+  }
+
+  public static void setSortMergeJoinEnabled(boolean sortMergeJoinEnabled) {
+    PhoenixTransactSQL.sortMergeJoinEnabled = sortMergeJoinEnabled; // static flag read by getLatestMetricsHints()
+  }
+
   public static PreparedStatement prepareGetMetricsSqlStmt(Connection connection,
                                                            Condition condition) throws SQLException {
 
@@ -448,6 +465,7 @@ public class PhoenixTransactSQL {
       stmtStr = condition.getStatement();
     } else {
       stmtStr = String.format(GET_LATEST_METRIC_SQL,
+        getLatestMetricsHints(),
         METRICS_RECORD_TABLE_NAME,
         METRICS_RECORD_TABLE_NAME,
         condition.getConditionClause());

http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java
----------------------------------------------------------------------
diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java b/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java
index 2d48a58..0a6f120 100644
--- a/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java
+++ b/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessorTest.java
@@ -18,6 +18,7 @@
 package org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.DoNotRetryIOException;
 import org.apache.hadoop.metrics2.sink.timeline.Precision;
 import org.apache.hadoop.metrics2.sink.timeline.TimelineMetrics;
 import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.aggregators.Function;
@@ -25,6 +26,7 @@ import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.
 import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.ConnectionProvider;
 import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.DefaultCondition;
 import org.apache.hadoop.yarn.server.applicationhistoryservice.metrics.timeline.query.PhoenixTransactSQL;
+import org.apache.phoenix.exception.PhoenixIOException;
 import org.easymock.EasyMock;
 import org.junit.Test;
 import org.junit.runner.RunWith;
@@ -43,10 +45,8 @@ import java.util.List;
 import java.util.Map;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
 
-/**
- * Created by user on 9/7/15.
- */
 @RunWith(PowerMockRunner.class)
 @PrepareForTest(PhoenixTransactSQL.class)
 public class PhoenixHBaseAccessorTest {
@@ -96,7 +96,8 @@ public class PhoenixHBaseAccessorTest {
   }
 
   @Test
-  public void testGetMetricRecordsException() throws SQLException, IOException {
+  public void testGetMetricRecordsIOException()
+    throws SQLException, IOException {
 
     Configuration hbaseConf = new Configuration();
     hbaseConf.setStrings(ZOOKEEPER_QUORUM, "quorum");
@@ -139,4 +140,53 @@ public class PhoenixHBaseAccessorTest {
     EasyMock.verify(preparedStatementMock, rsMock, io, runtimeException);
   }
 
+  @Test
+  public void testGetMetricRecordsPhoenixIOExceptionDoNotRetryException()
+    throws SQLException, IOException {
+    // Scenario: executeQuery() throws a PhoenixIOException chain ending in a DoNotRetryIOException.
+    Configuration hbaseConf = new Configuration();
+    hbaseConf.setStrings(ZOOKEEPER_QUORUM, "quorum");
+    Configuration metricsConf = new Configuration();
+    // The provider yields a null Connection; PhoenixTransactSQL is mocked statically below.
+    ConnectionProvider connectionProvider = new ConnectionProvider() {
+      @Override
+      public Connection getConnection() throws SQLException {
+        return null;
+      }
+    };
+
+    PhoenixHBaseAccessor accessor = new PhoenixHBaseAccessor(hbaseConf, metricsConf, connectionProvider);
+
+    List<String> metricNames = new LinkedList<>();
+    List<String> hostnames = new LinkedList<>();
+    Map<String, List<Function>> metricFunctions = new HashMap<>();
+    // Record expectations: statement lookup, then the sort-merge fallback switch.
+    PowerMock.mockStatic(PhoenixTransactSQL.class);
+    PreparedStatement preparedStatementMock = EasyMock.createNiceMock(PreparedStatement.class);
+    Condition condition = new DefaultCondition(metricNames, hostnames, "appid", "instanceid", null, null, Precision.SECONDS, 10, true);
+    EasyMock.expect(PhoenixTransactSQL.prepareGetLatestMetricSqlStmt(null, condition)).andReturn(preparedStatementMock).once();
+    PhoenixTransactSQL.setSortMergeJoinEnabled(true);
+    EasyMock.expectLastCall();
+    ResultSet rsMock = EasyMock.createNiceMock(ResultSet.class);
+    PhoenixIOException pioe1 = EasyMock.createNiceMock(PhoenixIOException.class);
+    PhoenixIOException pioe2 = EasyMock.createNiceMock(PhoenixIOException.class);
+    DoNotRetryIOException dnrioe = EasyMock.createNiceMock(DoNotRetryIOException.class);
+    EasyMock.expect(preparedStatementMock.executeQuery()).andThrow(pioe1);
+    EasyMock.expect(pioe1.getCause()).andReturn(pioe2).atLeastOnce();
+    EasyMock.expect(pioe2.getCause()).andReturn(dnrioe).atLeastOnce();
+    StackTraceElement stackTrace[] = new StackTraceElement[]{new StackTraceElement("HashJoinRegionScanner","method","file",1)};
+    EasyMock.expect(dnrioe.getStackTrace()).andReturn(stackTrace).atLeastOnce();
+
+    // Replay every mock; the call below must throw, otherwise fail() trips.
+    PowerMock.replayAll();
+    EasyMock.replay(preparedStatementMock, rsMock, pioe1, pioe2, dnrioe);
+    try {
+      TimelineMetrics tml = accessor.getMetricRecords(condition, metricFunctions);
+      fail();
+    } catch (Exception e) {
+      // Expected: the exception propagates; nothing further to check here.
+    }
+    PowerMock.verifyAll();
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java
----------------------------------------------------------------------
diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java b/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java
index ec6a472..6bf15c7 100644
--- a/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java
+++ b/ambari-metrics/ambari-metrics-timelineservice/src/test/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/TestPhoenixTransactSQL.java
@@ -382,6 +382,30 @@ public class TestPhoenixTransactSQL {
   }
 
   @Test
+  public void testPrepareGetLatestMetricSqlStmtSortMergeJoinAlgorithm() throws SQLException {
+    Condition condition = new DefaultCondition(Arrays.asList("cpu_user", "mem_free"),
+      Arrays.asList("h1"), "a1", "i1", null, null, null, null, false);
+    Connection connection = createNiceMock(Connection.class);
+    PreparedStatement preparedStatement = createNiceMock(PreparedStatement.class);
+    ParameterMetaData parameterMetaData = createNiceMock(ParameterMetaData.class);
+    Capture<String> stmtCapture = new Capture<String>();
+    expect(connection.prepareStatement(EasyMock.and(EasyMock.anyString(), EasyMock.capture(stmtCapture))))
+        .andReturn(preparedStatement);
+    expect(preparedStatement.getParameterMetaData()).andReturn(parameterMetaData).anyTimes();
+    expect(parameterMetaData.getParameterCount()).andReturn(6).anyTimes();
+    replay(connection, preparedStatement, parameterMetaData);
+    // The flag is static: restore the default (false) afterwards so the hint
+    // does not leak into statements built by other tests.
+    PhoenixTransactSQL.setSortMergeJoinEnabled(true);
+    try {
+      PhoenixTransactSQL.prepareGetLatestMetricSqlStmt(connection, condition);
+      Assert.assertTrue(stmtCapture.getValue().contains("/*+ USE_SORT_MERGE_JOIN NO_CACHE */"));
+    } finally {
+      PhoenixTransactSQL.setSortMergeJoinEnabled(false);
+    }
+  }
+
+  @Test
   public void testPrepareGetMetricsPrecisionHours() throws SQLException {
     Condition condition = new DefaultCondition(
       Arrays.asList("cpu_user", "mem_free"), Collections.singletonList("h1"),

http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml
index dfc2878..3f4a9d4 100644
--- a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml
+++ b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/configuration/ams-hbase-site.xml
@@ -333,7 +333,27 @@
       a normal table and would return items in rowkey order for scans
     </description>
   </property>
-
+  <property>
+    <name>phoenix.coprocessor.maxServerCacheTimeToLiveMs</name>
+    <value>60000</value>
+    <description>
+      Maximum time to live (in milliseconds) of server caches. A cache entry
+      expires after this amount of time has passed since its last access.
+      Consider increasing this value when a server-side IOException
+      ("Could not find hash cache for joinId") occurs. Warnings like
+      "Earlier hash cache(s) might have expired on servers" may also be a
+      sign that this number should be increased.
+    </description>
+  </property>
+  <property>
+    <name>phoenix.coprocessor.maxMetaDataCacheSize</name>
+    <value>20480000</value>
+    <description>
+      Max size in bytes of total server-side metadata cache after which
+      evictions will begin to occur based on least recent access time.
+      Default is 20 MB (20480000 bytes).
+    </description>
+  </property>
   <property>
     <name>dfs.client.read.shortcircuit</name>
     <value>true</value>

http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
index 68522bf..7181122 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
@@ -480,11 +480,15 @@ class HDP206StackAdvisor(DefaultStackAdvisor):
         putAmsHbaseSiteProperty("hbase.regionserver.global.memstore.lowerLimit", 0.25)
         putAmsHbaseSiteProperty("phoenix.query.maxGlobalMemoryPercentage", 20)
         putTimelineServiceProperty("phoenix.query.maxGlobalMemoryPercentage", 30)
+        putAmsHbaseSiteProperty("phoenix.coprocessor.maxMetaDataCacheSize", 81920000)
       elif total_sinks_count >= 500:
         putAmsHbaseSiteProperty("hbase.regionserver.handler.count", 60)
         putAmsHbaseSiteProperty("hbase.regionserver.hlog.blocksize", 134217728)
         putAmsHbaseSiteProperty("hbase.regionserver.maxlogs", 64)
         putAmsHbaseSiteProperty("hbase.hregion.memstore.flush.size", 268435456)
+        putAmsHbaseSiteProperty("phoenix.coprocessor.maxMetaDataCacheSize", 40960000)
+      else:
+        putAmsHbaseSiteProperty("phoenix.coprocessor.maxMetaDataCacheSize", 20480000)
       pass
 
     # Distributed mode heap size

http://git-wip-us.apache.org/repos/asf/ambari/blob/646de575/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py b/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
index b38af78..a5a588f 100644
--- a/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
+++ b/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
@@ -1999,6 +1999,7 @@ class TestHDP22StackAdvisor(TestCase):
       },
       "ams-hbase-site": {
         "properties": {
+          "phoenix.coprocessor.maxMetaDataCacheSize": "20480000",
           "hbase.regionserver.global.memstore.lowerLimit": "0.3",
           "hbase.regionserver.global.memstore.upperLimit": "0.35",
           "hbase.hregion.memstore.flush.size": "134217728",