You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by du...@apache.org on 2023/04/13 15:21:38 UTC
[ozone] branch master updated: HDDS-8345. [Snapshot] Remove snapshot from SnapshotChainManager in case of failure (#4525)
This is an automated email from the ASF dual-hosted git repository.
duong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 8515c13a1e HDDS-8345. [Snapshot] Remove snapshot from SnapshotChainManager in case of failure (#4525)
8515c13a1e is described below
commit 8515c13a1eda56d16b531b67246e99b932c52318
Author: Hemant Kumar <he...@gmail.com>
AuthorDate: Thu Apr 13 08:21:31 2023 -0700
HDDS-8345. [Snapshot] Remove snapshot from SnapshotChainManager in case of failure (#4525)
---
.../ozone/rocksdiff/RocksDBCheckpointDiffer.java | 2 +-
.../hadoop/ozone/om/SnapshotChainManager.java | 2 +-
.../request/snapshot/OMSnapshotCreateRequest.java | 39 ++++++++++++++++++++--
3 files changed, 39 insertions(+), 4 deletions(-)
diff --git a/hadoop-hdds/rocksdb-checkpoint-differ/src/main/java/org/apache/ozone/rocksdiff/RocksDBCheckpointDiffer.java b/hadoop-hdds/rocksdb-checkpoint-differ/src/main/java/org/apache/ozone/rocksdiff/RocksDBCheckpointDiffer.java
index 11ef743a12..8c69770080 100644
--- a/hadoop-hdds/rocksdb-checkpoint-differ/src/main/java/org/apache/ozone/rocksdiff/RocksDBCheckpointDiffer.java
+++ b/hadoop-hdds/rocksdb-checkpoint-differ/src/main/java/org/apache/ozone/rocksdiff/RocksDBCheckpointDiffer.java
@@ -1062,7 +1062,7 @@ public class RocksDBCheckpointDiffer implements AutoCloseable {
} catch (RocksDBException e) {
LOG.warn("Can't get num of keys in SST '{}': {}", file, e.getMessage());
} catch (FileNotFoundException e) {
- LOG.info("Can't find SST '{}'", file, e);
+ LOG.info("Can't find SST '{}'", file);
}
CompactionNode fileNode = new CompactionNode(
file, snapshotID, numKeys, seqNum);
diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
index e57b7c9e9d..8dc6c703e0 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
@@ -84,7 +84,7 @@ public class SnapshotChainManager {
if (prevGlobalID != null &&
!snapshotChainGlobal.containsKey(prevGlobalID)) {
throw new IOException("Snapshot Chain corruption: "
- + " previous snapshotID given but no associated snapshot "
+ + "previous snapshotID given but no associated snapshot "
+ "found in snapshot chain: SnapshotID "
+ prevGlobalID);
}
diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
index 6855ff476a..fb082bbe3e 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
@@ -48,13 +48,13 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
+import java.util.Objects;
import java.util.UUID;
import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.FILE_ALREADY_EXISTS;
import static org.apache.hadoop.ozone.om.lock.OzoneManagerLock.Resource.BUCKET_LOCK;
import static org.apache.hadoop.ozone.om.lock.OzoneManagerLock.Resource.SNAPSHOT_LOCK;
-
/**
* Handles CreateSnapshot Request.
*/
@@ -185,6 +185,21 @@ public class OMSnapshotCreateRequest extends OMClientRequest {
omClientResponse = new OMSnapshotCreateResponse(
omResponse.build(), snapshotInfo);
} catch (IOException ex) {
+ // Remove the snapshot from the SnapshotChainManager in case of any failure.
+ // It is possible that a createSnapshot request fails after the snapshot
+ // has been added to the snapshot chain manager, because it could not be
+ // added to the cache/DB.
+ // In that scenario, SnapshotChainManager#globalSnapshotId will point to
+ // the failed createSnapshot request's snapshotId, but in reality that
+ // snapshot doesn't exist in the SnapshotInfo table.
+ // If it doesn't get removed, OM restart will crash on
+ // SnapshotChainManager#loadFromSnapshotInfoTable because it cannot
+ // find the previous snapshot, which doesn't exist because it was never
+ // added to the SnapshotInfo table.
+ if (Objects.equals(snapshotInfo.getSnapshotID(),
+ snapshotChainManager.getLatestGlobalSnapshot())) {
+ removeSnapshotInfoFromSnapshotChainManager(snapshotChainManager,
+ snapshotInfo);
+ }
exception = ex;
omClientResponse = new OMSnapshotCreateResponse(
createErrorOMResponse(omResponse, exception));
@@ -215,5 +230,25 @@ public class OMSnapshotCreateRequest extends OMClientRequest {
}
return omClientResponse;
}
-
+
+ /**
+ * Removes the given snapshot from the SnapshotChainManager.
+ * If the removal fails with an IOException, the exception is logged as an
+ * error and swallowed rather than rethrown.
+ * Ideally, the removal should never fail. If it does and the exception
+ * were thrown, we would lose track of why the snapshot creation itself
+ * failed. Hence, to not lose that information, it is better to just log
+ * and swallow the exception.
+ *
+ * @param snapshotChainManager the chain manager to remove the snapshot from
+ * @param info the snapshot to remove from the chain
+ */
+ private void removeSnapshotInfoFromSnapshotChainManager(
+ SnapshotChainManager snapshotChainManager,
+ SnapshotInfo info
+ ) {
+ try {
+ snapshotChainManager.deleteSnapshot(info);
+ } catch (IOException exception) {
+ LOG.error("Failed to remove snapshot: {} from SnapshotChainManager.",
+ info, exception);
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org