You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ns...@apache.org on 2011/10/11 04:13:20 UTC

svn commit: r1181491 - in /hbase/branches/0.89/src: main/java/org/apache/hadoop/hbase/regionserver/metrics/ main/java/org/apache/hadoop/hbase/regionserver/wal/ test/java/org/apache/hadoop/hbase/regionserver/

Author: nspiegelberg
Date: Tue Oct 11 02:13:19 2011
New Revision: 1181491

URL: http://svn.apache.org/viewvc?rev=1181491&view=rev
Log:
Find outliers in HLog metrics

Summary:
Currently, the hlog metrics don't show the real min/max.  It
looks higher in OpenTSBD because you see avg for the RS with the highest
metric.  Refactor hlog code to give us actual min max data so we can
detect outliers.

Test Plan:
- mvn test -Dtest=TestHRegion
 - ./manage-v2-cluster.sh verify

DiffCamp Revision: 215561
Reviewed By: kannan
Reviewers: aaiyer, kannan, kranganathan
CC: kannan
Revert Plan:
OK

Modified:
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLog.java
    hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java?rev=1181491&r1=1181490&r2=1181491&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/metrics/RegionServerMetrics.java Tue Oct 11 02:13:19 2011
@@ -246,17 +246,12 @@ public class RegionServerMetrics impleme
       // }
       // Means you can't pass a numOps of zero or get a ArithmeticException / by zero.
       // HLog metrics
-      int ops = HLog.getWriteOps();
-      if (ops != 0) {
-        this.fsWriteLatency.inc(ops, HLog.getWriteTime());
-        this.fsWriteSize.inc(ops, HLog.getWriteSize());
-      }
-      ops = HLog.getSyncOps();
-      if (ops != 0) this.fsSyncLatency.inc(ops, HLog.getSyncTime());
-      ops = HLog.getGSyncOps();
-      if (ops != 0) this.fsGroupSyncLatency.inc(ops, HLog.getGSyncTime());
+      addHLogMetric(HLog.getWriteTime(), this.fsWriteLatency);
+      addHLogMetric(HLog.getWriteSize(), this.fsWriteSize);
+      addHLogMetric(HLog.getSyncTime(), this.fsSyncLatency);
+      addHLogMetric(HLog.getGSyncTime(), this.fsGroupSyncLatency);
       // HFile metrics
-      ops = HFile.getReadOps();
+      int ops = HFile.getReadOps();
       if (ops != 0) this.fsReadLatency.inc(ops, HFile.getReadTime());
       /* NOTE: removed HFile write latency.  2 reasons:
        * 1) Mixing HLog latencies are far higher priority since they're
@@ -278,6 +273,18 @@ public class RegionServerMetrics impleme
     this.metricsRecord.update();
   }
 
+  private void addHLogMetric(HLog.Metric logMetric,
+      MetricsTimeVaryingRate hadoopMetric) {
+    if (logMetric.count > 0)
+      hadoopMetric.inc(logMetric.min);
+    if (logMetric.count > 1)
+      hadoopMetric.inc(logMetric.max);
+    if (logMetric.count > 2) {
+      int ops = logMetric.count - 2;
+      hadoopMetric.inc(ops, logMetric.total - logMetric.max - logMetric.min);
+    }
+  }
+
   public void resetAllMinMax() {
     this.atomicIncrementTime.resetMinMax();
     this.fsReadLatency.resetMinMax();

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLog.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLog.java?rev=1181491&r1=1181490&r2=1181491&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLog.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLog.java Tue Oct 11 02:13:19 2011
@@ -249,60 +249,59 @@ public class HLog implements Syncable {
     }
   }
 
+  public static class Metric {
+    public long min = Long.MAX_VALUE;
+    public long max = 0;
+    public long total = 0;
+    public int count = 0;
+
+    synchronized void inc(final long val) {
+      min = Math.min(min, val);
+      max = Math.max(max, val);
+      total += val;
+      ++count;
+    }
+
+    synchronized Metric get() {
+      Metric copy = new Metric();
+      copy.min = min;
+      copy.max = max;
+      copy.total = total;
+      copy.count = count;
+      this.min = Long.MAX_VALUE;
+      this.max = 0;
+      this.total = 0;
+      this.count = 0;
+      return copy;
+    }
+  }
+
   // For measuring latency of writes
-  private static volatile int writeOps = 0;
-  private static volatile long writeTime = 0;
-  private static volatile long writeSize = 0;
+  private static Metric writeTime = new Metric();
+  private static Metric writeSize = new Metric();
   // For measuring latency of syncs
-  private static volatile int syncOps = 0;
-  private static volatile long syncTime = 0;
-  private static volatile int gsyncOps = 0;
-  private static volatile long gsyncTime = 0;
+  private static Metric syncTime = new Metric();
+  private static Metric gsyncTime = new Metric();
 
   public static volatile long lastSplitTime = 0;
   public static volatile long lastSplitSize = 0;
 
-  public static int getWriteOps() {
-    int ret = writeOps;
-    writeOps = 0;
-    return ret;
-  }
-
-  public static long getWriteTime() {
-    long ret = writeTime;
-    writeTime = 0;
-    return ret;
-  }
-
-  public static long getWriteSize() {
-    long ret = writeSize;
-    writeSize = 0;
-    return ret;
+  public static Metric getWriteTime() {
+    return writeTime.get();
   }
 
-  public static int getSyncOps() {
-    int ret = syncOps;
-    syncOps = 0;
-    return ret;
+  public static Metric getWriteSize() {
+    return writeSize.get();
   }
 
-  public static long getSyncTime() {
-    long ret = syncTime;
-    syncTime = 0;
-    return ret;
+  public static Metric getSyncTime() {
+    return syncTime.get();
   }
 
-  public static int getGSyncOps() {
-    int ret = gsyncOps;
-    gsyncOps = 0;
-    return ret;
+  public static Metric getGSyncTime() {
+    return gsyncTime.get();
   }
 
-  public static long getGSyncTime() {
-    long ret = gsyncTime;
-    gsyncTime = 0;
-    return ret;
-  }
 
   /**
    * HLog creating with a null actions listener.
@@ -878,14 +877,12 @@ public class HLog implements Syncable {
       // Only count 1 row as an unflushed entry.
       txid = this.unflushedEntries.incrementAndGet();
     }
-    writeTime += System.currentTimeMillis() - start;
-    writeOps++;
+    writeTime.inc(System.currentTimeMillis() - start);
 
     // sync txn to file system
     start = System.currentTimeMillis();
     this.sync(info.isMetaRegion(), txid);
-    gsyncTime += System.currentTimeMillis() - start;
-    gsyncOps++;
+    gsyncTime.inc(System.currentTimeMillis() - start);
 
   }
 
@@ -1028,8 +1025,7 @@ public class HLog implements Syncable {
           long doneUpto = this.unflushedEntries.get();
           this.writer.sync();
           this.syncTillHere = doneUpto;
-          syncTime += System.currentTimeMillis() - now;
-          syncOps++;
+          syncTime.inc(System.currentTimeMillis() - now);
           this.forceSync = false;
 
           // if the number of replicas in HDFS has fallen below the initial
@@ -1114,7 +1110,7 @@ public class HLog implements Syncable {
       for(KeyValue kv : logEdit.getKeyValues()) {
         len += kv.getLength();
       }
-      writeSize += len;
+      writeSize.inc(len);
       if (took > 1000) {
         LOG.warn(String.format(
           "%s took %d ms appending an edit to hlog; editcount=%d, len~=%s",

Modified: hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java?rev=1181491&r1=1181490&r2=1181491&view=diff
==============================================================================
--- hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java (original)
+++ hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java Tue Oct 11 02:13:19 2011
@@ -345,8 +345,8 @@ public class TestHRegion extends HBaseTe
     byte[] val = Bytes.toBytes("val");
     initHRegion(b, getName(), cf);
 
-    HLog.getSyncOps(); // clear counter from prior tests
-    assertEquals(0, HLog.getSyncOps());
+    HLog.getSyncTime(); // clear counter from prior tests
+    assertEquals(0, HLog.getSyncTime().count);
 
     LOG.info("First a batch put with all valid puts");
     final Put[] puts = new Put[10];
@@ -360,7 +360,7 @@ public class TestHRegion extends HBaseTe
     for (int i = 0; i < 10; i++) {
       assertEquals(OperationStatusCode.SUCCESS, codes[i]);
     }
-    assertEquals(1, HLog.getSyncOps());
+    assertEquals(1, HLog.getSyncTime().count);
 
     LOG.info("Next a batch put with one invalid family");
     puts[5].add(Bytes.toBytes("BAD_CF"), qual, val);
@@ -370,7 +370,7 @@ public class TestHRegion extends HBaseTe
       assertEquals((i == 5) ? OperationStatusCode.BAD_FAMILY :
         OperationStatusCode.SUCCESS, codes[i]);
     }
-    assertEquals(1, HLog.getSyncOps());
+    assertEquals(1, HLog.getSyncTime().count);
 
     LOG.info("Next a batch put that has to break into two batches to avoid a lock");
     Integer lockedRow = region.obtainRowLock(Bytes.toBytes("row_2"));
@@ -391,7 +391,7 @@ public class TestHRegion extends HBaseTe
 
     LOG.info("...waiting for put thread to sync first time");
     long startWait = System.currentTimeMillis();
-    while (HLog.getSyncOps() == 0) {
+    while (HLog.getSyncTime().count == 0) {
       Thread.sleep(100);
       if (System.currentTimeMillis() - startWait > 10000) {
         fail("Timed out waiting for thread to sync first minibatch");
@@ -402,7 +402,7 @@ public class TestHRegion extends HBaseTe
     LOG.info("...joining on thread");
     ctx.stop();
     LOG.info("...checking that next batch was synced");
-    assertEquals(1, HLog.getSyncOps());
+    assertEquals(1, HLog.getSyncTime().count);
     codes = retFromThread.get();
     for (int i = 0; i < 10; i++) {
       assertEquals((i == 5) ? OperationStatusCode.BAD_FAMILY :
@@ -426,7 +426,7 @@ public class TestHRegion extends HBaseTe
         OperationStatusCode.SUCCESS, codes[i]);
     }
     // Make sure we didn't do an extra batch
-    assertEquals(1, HLog.getSyncOps());
+    assertEquals(1, HLog.getSyncTime().count);
 
     // Make sure we still hold lock
     assertTrue(region.isRowLocked(lockedRow));