You are viewing a plain text version of this content. The canonical link for it is here.
Posted to hdfs-commits@hadoop.apache.org by el...@apache.org on 2012/07/18 17:42:19 UTC

svn commit: r1362985 - in /hadoop/common/branches/branch-2/hadoop-hdfs-project: ./ hadoop-hdfs/ hadoop-hdfs/src/main/java/ hadoop-hdfs/src/main/native/ hadoop-hdfs/src/main/webapps/datanode/ hadoop-hdfs/src/main/webapps/hdfs/ hadoop-hdfs/src/main/webap...

Author: eli
Date: Wed Jul 18 15:42:19 2012
New Revision: 1362985

URL: http://svn.apache.org/viewvc?rev=1362985&view=rev
Log:
HDFS-2966. TestNameNodeMetrics tests can fail under load. Contributed by Steve Loughran

Modified:
    hadoop/common/branches/branch-2/hadoop-hdfs-project/   (props changed)
    hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/   (props changed)
    hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
    hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/   (props changed)
    hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/native/   (props changed)
    hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/datanode/   (props changed)
    hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/   (props changed)
    hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/secondary/   (props changed)
    hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/hdfs/   (props changed)
    hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java

Propchange: hadoop/common/branches/branch-2/hadoop-hdfs-project/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-hdfs-project:r1298820

Propchange: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs:r1298820

Modified: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt?rev=1362985&r1=1362984&r2=1362985&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt (original)
+++ hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt Wed Jul 18 15:42:19 2012
@@ -348,6 +348,8 @@ Release 2.0.1-alpha - UNRELEASED
     HDFS-3609. libhdfs: don't force the URI to look like hdfs://hostname:port.
     (Colin Patrick McCabe via eli)
 
+    HDFS-2966. TestNameNodeMetrics tests can fail under load. (stevel)
+
   BREAKDOWN OF HDFS-3042 SUBTASKS
 
     HDFS-2185. HDFS portion of ZK-based FailoverController (todd)

Propchange: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/java/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/java:r1298820

Propchange: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/native/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/native:r1298820

Propchange: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/datanode/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/datanode:r1298820

Propchange: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs:r1298820

Propchange: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/secondary/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/secondary:r1298820

Propchange: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/hdfs/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-hdfs-project/hadoop-hdfs/src/test/hdfs:r1298820

Modified: hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java?rev=1362985&r1=1362984&r2=1362985&view=diff
==============================================================================
--- hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java (original)
+++ hadoop/common/branches/branch-2/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java Wed Jul 18 15:42:19 2012
@@ -62,6 +62,8 @@ public class TestNameNodeMetrics {
   
   // Number of datanodes in the cluster
   private static final int DATANODE_COUNT = 3; 
+  private static final int WAIT_GAUGE_VALUE_RETRIES = 20;
+
   static {
     CONF.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 100);
     CONF.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, 1);
@@ -148,11 +150,8 @@ public class TestNameNodeMetrics {
     assertGauge("BlockCapacity", blockCapacity, rb);
     fs.delete(file, true);
     filesTotal--; // reduce the filecount for deleted file
-    
-    waitForDeletion();
-    updateMetrics();
-    rb = getMetrics(NS_METRICS);
-    assertGauge("FilesTotal", filesTotal, rb);
+
+    rb = waitForDnMetricValue(NS_METRICS, "FilesTotal", filesTotal);
     assertGauge("BlocksTotal", 0L, rb);
     assertGauge("PendingDeletionBlocks", 0L, rb);
 
@@ -185,9 +184,7 @@ public class TestNameNodeMetrics {
     assertGauge("PendingReplicationBlocks", 1L, rb);
     assertGauge("ScheduledReplicationBlocks", 1L, rb);
     fs.delete(file, true);
-    waitForDeletion();
-    rb = getMetrics(NS_METRICS);
-    assertGauge("CorruptBlocks", 0L, rb);
+    rb = waitForDnMetricValue(NS_METRICS, "CorruptBlocks", 0L);
     assertGauge("PendingReplicationBlocks", 0L, rb);
     assertGauge("ScheduledReplicationBlocks", 0L, rb);
   }
@@ -229,8 +226,7 @@ public class TestNameNodeMetrics {
     assertGauge("UnderReplicatedBlocks", 1L, rb);
     assertGauge("MissingBlocks", 1L, rb);
     fs.delete(file, true);
-    waitForDeletion();
-    assertGauge("UnderReplicatedBlocks", 0L, getMetrics(NS_METRICS));
+    waitForDnMetricValue(NS_METRICS, "UnderReplicatedBlocks", 0L);
   }
 
   private void waitForDeletion() throws InterruptedException {
@@ -238,7 +234,44 @@ public class TestNameNodeMetrics {
     // the blocks pending deletion are sent for deletion to the datanodes.
     Thread.sleep(DFS_REPLICATION_INTERVAL * (DATANODE_COUNT + 1) * 1000);
   }
-  
+
+  /**
+   * Wait, by polling, for the named long gauge of a metrics source to
+   * reach the expected value, then assert it on the last record polled.
+   *
+   * Starts with waitForDeletion() because the tests share one FS instance
+   * and are not independent; then polls at most (DATANODE_COUNT + 1) *
+   * WAIT_GAUGE_VALUE_RETRIES times, sleeping between polls.
+   *
+   * @param source name of the metrics source to query
+   * @param name name of the long gauge to read
+   * @param expected value the gauge must reach
+   * @return the last metrics record polled, for further assertions
+   * @throws Exception e.g. InterruptedException if a wait is interrupted
+   */
+  private MetricsRecordBuilder waitForDnMetricValue(String source,
+                                                    String name,
+                                                    long expected)
+      throws Exception {
+    MetricsRecordBuilder rb;
+    long gauge;
+    //initial fixed wait before the first poll
+    waitForDeletion();
+    //the retry budget is generous for slow systems; fast ones still exit
+    //as soon as the gauge matches
+    int retries = (DATANODE_COUNT + 1) * WAIT_GAUGE_VALUE_RETRIES;
+    rb = getMetrics(source);
+    gauge = MetricsAsserts.getLongGauge(name, rb);
+    while (gauge != expected && (--retries > 0)) {
+      Thread.sleep(DFS_REPLICATION_INTERVAL * 500);
+      rb = getMetrics(source);
+      gauge = MetricsAsserts.getLongGauge(name, rb);
+    }
+    //either the gauge matched or the retries ran out; assert either way
+    assertGauge(name, expected, rb);
+    return rb;
+  }
+
   @Test
   public void testRenameMetrics() throws Exception {
     Path src = getTestPath("src");