Posted to issues@hbase.apache.org by GitBox <gi...@apache.org> on 2019/11/14 14:23:45 UTC

[GitHub] [hbase] Apache9 commented on a change in pull request #820: HBASE-23286 Improve MTTR: Split WAL to HFile

Apache9 commented on a change in pull request #820: HBASE-23286 Improve MTTR: Split WAL to HFile
URL: https://github.com/apache/hbase/pull/820#discussion_r346339427
 
 

 ##########
 File path: hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALSplitToHFile.java
 ##########
 @@ -0,0 +1,418 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.wal;
+
+import static org.apache.hadoop.hbase.regionserver.wal.AbstractTestWALReplay.addRegionEdits;
+import static org.apache.hadoop.hbase.wal.RecoveredHFilesOutputSink.WAL_SPLIT_TO_HFILE;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+import static org.mockito.Mockito.when;
+
+import java.io.IOException;
+import java.security.PrivilegedExceptionAction;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.Cell;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
+import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
+import org.apache.hadoop.hbase.client.Get;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.client.RegionInfoBuilder;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
+import org.apache.hadoop.hbase.regionserver.DefaultStoreEngine;
+import org.apache.hadoop.hbase.regionserver.HRegion;
+import org.apache.hadoop.hbase.regionserver.RegionScanner;
+import org.apache.hadoop.hbase.regionserver.RegionServerServices;
+import org.apache.hadoop.hbase.regionserver.wal.AbstractTestWALReplay;
+import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
+import org.apache.hadoop.hbase.security.User;
+import org.apache.hadoop.hbase.testclassification.LargeTests;
+import org.apache.hadoop.hbase.testclassification.RegionServerTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.EnvironmentEdge;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.util.FSUtils;
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.rules.TestName;
+import org.mockito.Mockito;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Category({ RegionServerTests.class, LargeTests.class })
+public class TestWALSplitToHFile {
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+      HBaseClassTestRule.forClass(TestWALSplitToHFile.class);
+
+  private static final Logger LOG = LoggerFactory.getLogger(TestWALSplitToHFile.class);
+  static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+  private final EnvironmentEdge ee = EnvironmentEdgeManager.getDelegate();
+  private Path rootDir = null;
+  private String logName;
+  private Path oldLogDir;
+  private Path logDir;
+  private FileSystem fs;
+  private Configuration conf;
+  private WALFactory wals;
+
+  @Rule
+  public final TestName TEST_NAME = new TestName();
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    Configuration conf = UTIL.getConfiguration();
+    conf.setBoolean(WAL_SPLIT_TO_HFILE, true);
+    UTIL.startMiniCluster(3);
+    Path hbaseRootDir = UTIL.getDFSCluster().getFileSystem().makeQualified(new Path("/hbase"));
+    LOG.info("hbase.rootdir=" + hbaseRootDir);
+    FSUtils.setRootDir(conf, hbaseRootDir);
+  }
+
+  @AfterClass
+  public static void tearDownAfterClass() throws Exception {
+    UTIL.shutdownMiniCluster();
+  }
+
+  @Before
+  public void setUp() throws Exception {
+    this.conf = HBaseConfiguration.create(UTIL.getConfiguration());
+    this.fs = UTIL.getDFSCluster().getFileSystem();
+    this.rootDir = FSUtils.getRootDir(this.conf);
+    this.oldLogDir = new Path(this.rootDir, HConstants.HREGION_OLDLOGDIR_NAME);
+    String serverName =
+        ServerName.valueOf(TEST_NAME.getMethodName() + "-manual", 16010, System.currentTimeMillis())
+            .toString();
+    this.logName = AbstractFSWALProvider.getWALDirectoryName(serverName);
+    this.logDir = new Path(this.rootDir, logName);
+    if (UTIL.getDFSCluster().getFileSystem().exists(this.rootDir)) {
+      UTIL.getDFSCluster().getFileSystem().delete(this.rootDir, true);
+    }
+    this.wals = new WALFactory(conf, TEST_NAME.getMethodName());
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    this.wals.close();
+    UTIL.getDFSCluster().getFileSystem().delete(this.rootDir, true);
+  }
+
+  /**
+   * @param p Directory to clean up
+   */
+  private void deleteDir(final Path p) throws IOException {
+    if (this.fs.exists(p)) {
+      if (!this.fs.delete(p, true)) {
+        throw new IOException("Failed remove of " + p);
+      }
+    }
+  }
+
+  private TableDescriptor createBasic3FamilyTD(final TableName tableName) throws IOException {
+    TableDescriptorBuilder builder = TableDescriptorBuilder.newBuilder(tableName);
+    builder.setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("a")).build());
+    builder.setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("b")).build());
+    builder.setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("c")).build());
+    TableDescriptor td = builder.build();
+    UTIL.getAdmin().createTable(td);
+    return td;
+  }
+
+  private WAL createWAL(Configuration c, Path hbaseRootDir, String logName) throws IOException {
+    FSHLog wal = new FSHLog(FileSystem.get(c), hbaseRootDir, logName, c);
+    wal.init();
+    return wal;
+  }
+
+  /**
+   * Test writing edits into an HRegion, closing it, splitting logs, opening
+   * Region again.  Verify seqids.
+   */
+  @Test
+  public void testReplayEditsWrittenViaHRegion()
+      throws IOException, SecurityException, IllegalArgumentException, InterruptedException {
+    final TableName tableName = TableName.valueOf(TEST_NAME.getMethodName());
+    final TableDescriptor td = createBasic3FamilyTD(tableName);
+    final RegionInfo ri = RegionInfoBuilder.newBuilder(tableName).build();
+    final Path basedir = FSUtils.getTableDir(this.rootDir, tableName);
+    deleteDir(basedir);
+    final byte[] rowName = tableName.getName();
+    final int countPerFamily = 10;
+
+    HRegion region3 = HBaseTestingUtility.createRegionAndWAL(ri, rootDir, this.conf, td);
+    HBaseTestingUtility.closeRegionAndWAL(region3);
+    // Write countPerFamily edits into the three families. Do a flush on one
+    // of the families during the load of edits so its seqid is not the same as
+    // the others', to test we do the right thing when seqids differ.
+    WAL wal = createWAL(this.conf, rootDir, logName);
+    HRegion region = HRegion.openHRegion(this.conf, this.fs, rootDir, ri, td, wal);
+    long seqid = region.getOpenSeqNum();
+    boolean first = true;
+    for (ColumnFamilyDescriptor cfd : td.getColumnFamilies()) {
+      addRegionEdits(rowName, cfd.getName(), countPerFamily, this.ee, region, "x");
+      if (first) {
+        // If first time through, flush so at least one family has a different seqid from the rest.
+        region.flush(true);
+        first = false;
+      }
+    }
+    // Now assert edits made it in.
+    final Get g = new Get(rowName);
+    Result result = region.get(g);
+    assertEquals(countPerFamily * td.getColumnFamilies().length, result.size());
+    // Now close the region (without flush), split the log, reopen the region and assert that
+    // replay of the log has the correct effect: our seqids are calculated correctly, so
+    // all edits in the logs are seen as 'stale'/old.
+    region.close(true);
+    wal.shutdown();
+    runWALSplit(this.conf);
+    WAL wal2 = createWAL(this.conf, rootDir, logName);
+    HRegion region2 = HRegion.openHRegion(conf, this.fs, rootDir, ri, td, wal2);
+    long seqid2 = region2.getOpenSeqNum();
+    assertTrue(seqid + result.size() < seqid2);
+    final Result result1b = region2.get(g);
+    assertEquals(result.size(), result1b.size());
+
+    // Next test. Add more edits, then 'crash' this region by stealing its wal
+    // out from under it and assert that replay of the log adds the edits back
+    // correctly when the region is opened again.
+    for (ColumnFamilyDescriptor hcd : td.getColumnFamilies()) {
+      addRegionEdits(rowName, hcd.getName(), countPerFamily, this.ee, region2, "y");
+    }
+    // Get count of edits.
+    final Result result2 = region2.get(g);
+    assertEquals(2 * result.size(), result2.size());
+    wal2.sync();
+    final Configuration newConf = HBaseConfiguration.create(this.conf);
+    User user = HBaseTestingUtility.getDifferentUser(newConf, tableName.getNameAsString());
+    user.runAs(new PrivilegedExceptionAction<Object>() {
+      @Override
+      public Object run() throws Exception {
+        runWALSplit(newConf);
+        FileSystem newFS = FileSystem.get(newConf);
+        // Make a new wal for new region open.
+        WAL wal3 = createWAL(newConf, rootDir, logName);
+        HRegion region3 = new HRegion(basedir, wal3, newFS, newConf, ri, td, null);
+        long seqid3 = region3.initialize();
+        Result result3 = region3.get(g);
+        // Assert that the count of cells is the same as before the crash.
+        assertEquals(result2.size(), result3.size());
+
+        // We can't close the earlier wal; it was appropriated when we split.
+        region3.close();
+        wal3.close();
+        return null;
+      }
+    });
+  }
+
+  /**
+   * Test that we recover correctly when there is a failure in between the
+   * flushes, i.e. some stores got flushed but others did not.
+   * Unfortunately, there is no easy hook to flush at a store level. The way
+   * we get around this is by flushing at the region level, and then deleting
+   * the recently flushed store file for one of the Stores. This would put us
+   * back in the situation where all but that store got flushed and the region
+   * died.
+   * We restart the region again, and verify that the edits were replayed.
+   */
+  @Test
+  public void testReplayEditsAfterPartialFlush()
+      throws IOException, SecurityException, IllegalArgumentException {
+    final TableName tableName = TableName.valueOf(TEST_NAME.getMethodName());
+    final RegionInfo ri = RegionInfoBuilder.newBuilder(tableName).build();
+    final Path basedir = FSUtils.getTableDir(this.rootDir, tableName);
+    deleteDir(basedir);
+    final byte[] rowName = tableName.getName();
+    final int countPerFamily = 10;
+    final TableDescriptor td = createBasic3FamilyTD(tableName);
+    HRegion region3 = HBaseTestingUtility.createRegionAndWAL(ri, rootDir, this.conf, td);
+    HBaseTestingUtility.closeRegionAndWAL(region3);
+    // Write countPerFamily edits into the three families, then flush the
+    // region. One family's store files are deleted below to simulate a
+    // failure in between the flushes.
+    WAL wal = createWAL(this.conf, rootDir, logName);
+    HRegion region = HRegion.openHRegion(this.conf, this.fs, rootDir, ri, td, wal);
+    long seqid = region.getOpenSeqNum();
+    for (ColumnFamilyDescriptor cfd : td.getColumnFamilies()) {
+      addRegionEdits(rowName, cfd.getName(), countPerFamily, this.ee, region, "x");
+    }
+
+    // Now assert edits made it in.
+    final Get g = new Get(rowName);
+    Result result = region.get(g);
+    assertEquals(countPerFamily * td.getColumnFamilies().length, result.size());
+
+    // Let us flush the region
+    region.flush(true);
+    region.close(true);
+    wal.shutdown();
+
+    // Delete the store files in the second column family to simulate a failure
+    // in between the flushes. We have 3 families; killing the middle one
+    // ensures that taking the maximum seqid will make us fail.
+    int cfCount = 0;
+    for (ColumnFamilyDescriptor cfd : td.getColumnFamilies()) {
+      cfCount++;
+      if (cfCount == 2) {
+        region.getRegionFileSystem().deleteFamily(cfd.getNameAsString());
+      }
+    }
+
+    // Let us try to split and recover
+    runWALSplit(this.conf);
+    WAL wal2 = createWAL(this.conf, rootDir, logName);
+    HRegion region2 = HRegion.openHRegion(this.conf, this.fs, rootDir, ri, td, wal2);
+    long seqid2 = region2.getOpenSeqNum();
+    assertTrue(seqid + result.size() < seqid2);
+
+    final Result result1b = region2.get(g);
+    assertEquals(result.size(), result1b.size());
+  }
+
+  /**
+   * Test that we can recover the data correctly after aborting a flush. In the
+   * test, we first abort a flush after writing some data, then write more data
+   * and flush again, and finally verify the data.
+   */
+  @Test
+  public void testReplayEditsAfterAbortingFlush() throws IOException {
+    final TableName tableName = TableName.valueOf(TEST_NAME.getMethodName());
+    final RegionInfo ri = RegionInfoBuilder.newBuilder(tableName).build();
+    final Path basedir = FSUtils.getTableDir(this.rootDir, tableName);
+    deleteDir(basedir);
+    final TableDescriptor td = createBasic3FamilyTD(tableName);
+    HRegion region3 = HBaseTestingUtility.createRegionAndWAL(ri, rootDir, this.conf, td);
+    HBaseTestingUtility.closeRegionAndWAL(region3);
+    // Write edits into the three families. The first flush below is made to
+    // fail, to test that we replay correctly after an aborted flush.
+    WAL wal = createWAL(this.conf, rootDir, logName);
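+    // Mock RegionServerServices so the test can flip its aborted state once the injected flush failure fires.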
+    RegionServerServices rsServices = Mockito.mock(RegionServerServices.class);
+    Mockito.doReturn(false).when(rsServices).isAborted();
+    when(rsServices.getServerName()).thenReturn(ServerName.valueOf("foo", 10, 10));
+    when(rsServices.getConfiguration()).thenReturn(conf);
+    Configuration customConf = new Configuration(this.conf);
+    customConf.set(DefaultStoreEngine.DEFAULT_STORE_FLUSHER_CLASS_KEY,
+        AbstractTestWALReplay.CustomStoreFlusher.class.getName());
+    HRegion region = HRegion.openHRegion(this.rootDir, ri, td, wal, customConf, rsServices, null);
+    int writtenRowCount = 10;
+    List<ColumnFamilyDescriptor> families = Arrays.asList(td.getColumnFamilies());
+    for (int i = 0; i < writtenRowCount; i++) {
+      Put put = new Put(Bytes.toBytes(tableName + Integer.toString(i)));
+      put.addColumn(families.get(i % families.size()).getName(), Bytes.toBytes("q"),
+          Bytes.toBytes("val"));
+      region.put(put);
+    }
+
+    // Now assert edits made it in.
+    RegionScanner scanner = region.getScanner(new Scan());
+    assertEquals(writtenRowCount, getScannedCount(scanner));
+
+    // Let us flush the region
+    AbstractTestWALReplay.CustomStoreFlusher.throwExceptionWhenFlushing.set(true);
+    try {
+      region.flush(true);
+      fail("Injected exception hasn't been thrown");
+    } catch (IOException e) {
+      LOG.info("Expected simulated exception when flushing region, {}", e.getMessage());
+      // simulated to abort server
+      Mockito.doReturn(true).when(rsServices).isAborted();
+      // Region normally does not accept writes after DroppedSnapshotException.
+      // We mock around it for this test.
+      region.setClosing(false);
+    }
+    // writing more data
+    int moreRow = 10;
+    for (int i = writtenRowCount; i < writtenRowCount + moreRow; i++) {
+      Put put = new Put(Bytes.toBytes(tableName + Integer.toString(i)));
+      put.addColumn(families.get(i % families.size()).getName(), Bytes.toBytes("q"),
+          Bytes.toBytes("val"));
+      region.put(put);
+    }
+    writtenRowCount += moreRow;
+    // call flush again
+    AbstractTestWALReplay.CustomStoreFlusher.throwExceptionWhenFlushing.set(false);
+    try {
+      region.flush(true);
+    } catch (IOException t) {
+      LOG.info("Expected exception when flushing region because server is stopped, {}",
+          t.getMessage());
+    }
+
+    region.close(true);
+    wal.shutdown();
+
+    // Let us try to split and recover
+    runWALSplit(this.conf);
+    WAL wal2 = createWAL(this.conf, rootDir, logName);
+    Mockito.doReturn(false).when(rsServices).isAborted();
+    HRegion region2 = HRegion.openHRegion(this.rootDir, ri, td, wal2, this.conf, rsServices, null);
+    scanner = region2.getScanner(new Scan());
+    assertEquals(writtenRowCount, getScannedCount(scanner));
+  }
+
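+  // Count the number of rows the given scanner returns.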
+  private int getScannedCount(RegionScanner scanner) throws IOException {
+    int scannedCount = 0;
+    List<Cell> results = new ArrayList<>();
+    while (true) {
+      boolean hasMore = scanner.next(results);
+      if (!results.isEmpty()) {
+        scannedCount++;
+      }
+      if (!hasMore) {
+        break;
+      }
+      results.clear();
+    }
+    return scannedCount;
+  }
+
+  /*
+   * Run the split.  Verify only single split file made.
+   * @param c
 
 Review comment:
   Avoid empty parameter javadoc.
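   For example, the tag should either carry a description or be dropped. A
   minimal sketch of the corrected comment (assuming c is the Configuration
   the split is run with, as elsewhere in this test):

       /**
        * Run the split. Verify only a single split file was made.
        * @param c Configuration to use when running the split
        */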

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services