You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@bigtop.apache.org by co...@apache.org on 2014/12/15 16:57:50 UTC
bigtop git commit: BIGTOP-1560. Add a test case for performing block
corruption recovery
Repository: bigtop
Updated Branches:
refs/heads/master fce161888 -> e7646c67d
BIGTOP-1560. Add a test case for performing block corruption recovery
Signed-off-by: Konstantin Boudnik <co...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo
Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/e7646c67
Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/e7646c67
Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/e7646c67
Branch: refs/heads/master
Commit: e7646c67d18643cd87b20e31f4b2ef68c62b3156
Parents: fce1618
Author: Dasha Boudnik <da...@wandisco.com>
Authored: Mon Dec 15 07:57:31 2014 -0800
Committer: Konstantin Boudnik <co...@apache.org>
Committed: Mon Dec 15 07:57:31 2014 -0800
----------------------------------------------------------------------
bigtop-tests/test-artifacts/hadoop/pom.xml | 6 +
.../itest/hadoop/hdfs/TestBlockRecovery.groovy | 205 +++++++++++++++++++
2 files changed, 211 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/bigtop/blob/e7646c67/bigtop-tests/test-artifacts/hadoop/pom.xml
----------------------------------------------------------------------
diff --git a/bigtop-tests/test-artifacts/hadoop/pom.xml b/bigtop-tests/test-artifacts/hadoop/pom.xml
index 748a66c..315edfe 100644
--- a/bigtop-tests/test-artifacts/hadoop/pom.xml
+++ b/bigtop-tests/test-artifacts/hadoop/pom.xml
@@ -44,6 +44,12 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-hdfs</artifactId>
+ <version>2.4.1</version>
+ <type>test-jar</type>
+ </dependency>
</dependencies>
</project>
http://git-wip-us.apache.org/repos/asf/bigtop/blob/e7646c67/bigtop-tests/test-artifacts/hadoop/src/main/groovy/org/apache/bigtop/itest/hadoop/hdfs/TestBlockRecovery.groovy
----------------------------------------------------------------------
diff --git a/bigtop-tests/test-artifacts/hadoop/src/main/groovy/org/apache/bigtop/itest/hadoop/hdfs/TestBlockRecovery.groovy b/bigtop-tests/test-artifacts/hadoop/src/main/groovy/org/apache/bigtop/itest/hadoop/hdfs/TestBlockRecovery.groovy
new file mode 100644
index 0000000..a75f016
--- /dev/null
+++ b/bigtop-tests/test-artifacts/hadoop/src/main/groovy/org/apache/bigtop/itest/hadoop/hdfs/TestBlockRecovery.groovy
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.bigtop.itest.hadoop.hdfs
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+import org.apache.hadoop.fs.FileSystem
+import org.apache.hadoop.hdfs.DistributedFileSystem
+import org.apache.hadoop.hdfs.HdfsConfiguration
+import org.junit.Assume
+
+import static org.junit.Assert.assertNotNull
+import static org.junit.Assert.assertTrue;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.apache.bigtop.itest.shell.Shell;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+
+
+/**
+ * Checks that HDFS recovers a block replica after the replica has been
+ * corrupted on disk.
+ *
+ * Preconditions enforced through JUnit assumptions in {@link #setUp()}:
+ * the test runs as user "hdfs" and the cluster reports at least three
+ * datanodes. The test file is created with a replication factor of
+ * (number of datanodes - 1), i.e. at least 2, so that a healthy replica is
+ * available to recover from.
+ *
+ * The block file is looked up and overwritten through the local file system,
+ * so the test must run on a host whose datanode data directories hold a
+ * replica of the test block.
+ */
+public class TestBlockRecovery {
+
+  private static Shell sh = new Shell("/bin/bash");
+
+  private static Configuration conf;
+
+  // Declared ahead of fsFilePath because that field's initializer reads it.
+  private static final String USER_DIR = "/user/hdfs";
+
+  private static final String corruptContent = "0123456789";
+  private static final String fsFilePath = USER_DIR + "/file0";
+  // Extracts the datanode IP addresses from the bracketed location list of fsck output.
+  private static final String grepIP = "grep -o '\\[[^]]*\\]' | " +
+    "grep -o '[0-9]*\\.[0-9]*\\.[0-9]*\\.[0-9]*'";
+  private static final String localTestDir = "/tmp/test";
+  private static final String outputFile = localTestDir + "/fsckOutput.txt";
+
+  // Milliseconds to wait between recovery polls.
+  private static final int sleepTime = 60 * 1000;
+  // Milliseconds to wait after removing a stale test file.
+  private static final int TIMEOUT = 5000;
+  // Upper bound on every polling loop in the test.
+  private static final int MAX_ATTEMPTS = 3;
+
+  private static String blockToTest;
+  private static String blockLocation;
+  private static String blockRecoveryNode;
+  private static String cksumError;
+  private static String initialBlockChecksum;
+  private static String fileContent;
+  private static String USERNAME;
+
+  private static def dataDirs = [];
+  private static def nodesBeforeRecovery = [];
+  private static def nodesAfterRecovery = [];
+
+  private static short numberOfDataNodes;
+  private static short repFactor;
+
+  private static final long fileLen = 10;
+  private static final long SEED = 0;
+
+  /**
+   * Prepares the fixture on the client side: resolves the datanode data
+   * directories, creates the test file with enough replicas, records the
+   * datanodes holding the block, locates the block file on the local disk and
+   * stores its checksum before corruption.
+   */
+  @BeforeClass
+  public static void setUp() {
+    conf = new HdfsConfiguration();
+    // Complete the configuration before it is used to create the file system.
+    conf.addResource("hdfs-site.xml");
+    FileSystem fileSys = DistributedFileSystem.get(conf);
+
+    // Resolve the property value first and split afterwards: calling split()
+    // on a missing key would throw before the fallback key could be tried.
+    String dataDirProperty = conf.get("dfs.data.dir");
+    if (dataDirProperty == null)
+      dataDirProperty = conf.get("dfs.datanode.data.dir");
+    assertNotNull("Could not determine datanode data directories", dataDirProperty);
+    dataDirs = dataDirProperty.split(",");
+
+    USERNAME = System.getProperty("user.name");
+    Assume.assumeTrue(USERNAME == "hdfs");
+
+    numberOfDataNodes = sh.exec("hdfs dfsadmin -report | grep ^Name | wc -l").getOut()[0] as short;
+    Assume.assumeTrue(numberOfDataNodes >= 3);
+
+    sh.exec("rm -rf $localTestDir");
+    sh.exec("mkdir $localTestDir");
+    sh.exec("hadoop fs -rm -r $fsFilePath");
+    Thread.sleep(TIMEOUT);
+    sh.exec("hadoop fs -mkdir -p $USER_DIR");
+    assertTrue("Failed to create input directory", sh.getRet() == 0);
+
+    // One replica fewer than there are datanodes.
+    repFactor = (numberOfDataNodes - 1);
+
+    DFSTestUtil.createFile(fileSys, new Path(fsFilePath), fileLen, repFactor, SEED);
+
+    fileContent = sh.exec("hadoop fs -cat $fsFilePath").getOut()[0];
+
+    sh.exec("hdfs fsck $fsFilePath -blocks -locations -files > $outputFile");
+    assertTrue("Could not write output to file", sh.getRet() == 0);
+
+    nodesBeforeRecovery = sh.exec("grep -o '\\[[^]]*\\]' $outputFile | " +
+      "grep -o '[0-9]*\\.[0-9]*\\.[0-9]*\\.[0-9]*'").getOut();
+    assertTrue("Could not obtain datanode addresses", sh.getRet() == 0);
+
+    blockToTest = sh.exec("grep -o 'blk_[0-9]*' $outputFile").getOut()[0];
+    assertTrue("Could not obtain block number", sh.getRet() == 0);
+
+    // Search each configured data directory until the block file is found.
+    for (int i = 0; i < dataDirs.length; i++) {
+      def dataDir = dataDirs[i]
+      blockLocation = sh.exec("find $dataDir -name $blockToTest | grep $dataDir").getOut()[0];
+      if (blockLocation != null) break;
+    }
+    assertNotNull("Could not find specified block", blockLocation);
+
+    initialBlockChecksum = sh.exec("cksum $blockLocation").getOut()[0].split(" ")[0];
+    assertTrue("Could not obtain checksum for block $blockToTest", sh.getRet() == 0);
+  }
+
+  /**
+   * Removes the HDFS test file and the local scratch directory. Both cleanup
+   * commands are run before either result is asserted, so a failed HDFS
+   * delete does not leave the local directory behind.
+   */
+  @AfterClass
+  public static void tearDown() {
+    sh.exec("hadoop fs -rm -r -skipTrash $fsFilePath");
+    int hdfsCleanupRet = sh.getRet();
+    sh.exec("rm -rf $localTestDir");
+    int localCleanupRet = sh.getRet();
+
+    assertTrue("Could not delete file $fsFilePath", hdfsCleanupRet == 0);
+    assertTrue("Could not delete test directory $localTestDir", localCleanupRet == 0);
+  }
+
+  /**
+   * Overwrites the local block file, reads the HDFS file to make the client
+   * notice the corruption, then polls until the block checksum on a datanode
+   * that held the block before and after matches the pre-corruption checksum
+   * and the file content is unchanged.
+   */
+  @Test
+  public void testBlockRecovery() {
+    // corrupt block
+    sh.exec("echo $corruptContent > $blockLocation");
+    assertTrue("Could not write to file", sh.getRet() == 0);
+
+    // perform checksum after block corruption
+    String corruptBlockChecksum = sh.exec("cksum $blockLocation").getOut()[0].split(" ")[0];
+    assertTrue("Could not obtain checksum for block $blockToTest", sh.getRet() == 0);
+
+    // trigger block recovery by trying to access the file
+    sh.exec("hadoop fs -cat $fsFilePath");
+
+    // Bounded wait until reading the file no longer reports a checksum error.
+    // The client warning is written to stderr, so stderr is routed into the
+    // pipe while the (binary) file content on stdout is discarded. This is
+    // only a wait; the recovery itself is asserted further down.
+    boolean checksumErrorSeen = true;
+    for (int j = 0; j < MAX_ATTEMPTS && checksumErrorSeen; j++) {
+      // wait a bit to let the block recover
+      sleep(sleepTime);
+      def matches = sh.exec("hadoop fs -cat $fsFilePath 2>&1 >/dev/null | grep -o 'Checksum error'").getOut();
+      cksumError = matches ? matches[0] : null;
+      checksumErrorSeen = (cksumError == "Checksum error");
+    }
+
+    nodesAfterRecovery = sh.exec("hdfs fsck $fsFilePath -blocks -locations -files | $grepIP").getOut();
+    assertTrue("Could not obtain datanode addresses", sh.getRet() == 0);
+
+    // Pick a datanode that reported the block both before and after corruption.
+    blockRecoveryNode = (nodesBeforeRecovery.intersect(nodesAfterRecovery))[0];
+
+    if (blockRecoveryNode == null) {
+      // Give the cluster one more interval before giving up.
+      sleep(sleepTime);
+
+      nodesAfterRecovery = sh.exec("hdfs fsck $fsFilePath -blocks -locations -files | $grepIP").getOut();
+      assertTrue("Could not obtain datanode addresses", sh.getRet() == 0);
+
+      blockRecoveryNode = (nodesBeforeRecovery.intersect(nodesAfterRecovery))[0];
+      // Null check instead of dereferencing: a missing node must fail with
+      // this message rather than with a NullPointerException.
+      assertNotNull("Block has not been successfully triggered for recovery.", blockRecoveryNode);
+    }
+
+    // Poll the block checksum on the chosen datanode. The loop always
+    // terminates after MAX_ATTEMPTS, and a match found on the last attempt is
+    // still honoured by the assertions below.
+    for (int attempt = 0;
+         attempt < MAX_ATTEMPTS && corruptBlockChecksum != initialBlockChecksum;
+         attempt++) {
+      sleep(sleepTime);
+      def cksumOut = sh.exec("ssh -o StrictHostKeyChecking=no -i ~/.ssh/id_hdfsuser " +
+        "$blockRecoveryNode 'cksum `find ${dataDirs.join(' ')}" +
+        " -name $blockToTest 2>/dev/null | grep $blockToTest` '").getOut();
+      // An empty result (ssh or find failed) keeps the previous checksum.
+      if (cksumOut) {
+        corruptBlockChecksum = cksumOut[0].split(" ")[0];
+      }
+    }
+    assertTrue("Block has not recovered", corruptBlockChecksum == initialBlockChecksum);
+
+    // Once the block has recovered, the file must read back unchanged.
+    def recoveredContent = sh.exec("hadoop fs -cat $fsFilePath").getOut();
+    assertTrue("Could not read file $fsFilePath", sh.getRet() == 0);
+    assertTrue("File content differs after block recovery", recoveredContent[0] == fileContent);
+  }
+
+}
+