Posted to issues@ozone.apache.org by GitBox <gi...@apache.org> on 2021/07/21 01:07:09 UTC

[GitHub] [ozone] umamaheswararao commented on a change in pull request #2442: HDDS-5466. Refactor BlockOutputStream.

umamaheswararao commented on a change in pull request #2442:
URL: https://github.com/apache/ozone/pull/2442#discussion_r673589055



##########
File path: hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/RatisBlockOutputStream.java
##########
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.hadoop.hdds.scm.storage;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.hdds.client.BlockID;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.BlockData;
+import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ChunkInfo;
+import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandResponseProto;
+import org.apache.hadoop.hdds.scm.OzoneClientConfig;
+import org.apache.hadoop.hdds.scm.XceiverClientFactory;
+import org.apache.hadoop.hdds.scm.XceiverClientReply;
+import org.apache.hadoop.hdds.scm.XceiverClientSpi;
+import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
+import org.apache.hadoop.ozone.common.ChunkBuffer;
+import org.apache.hadoop.ozone.common.OzoneChecksumException;
+import org.apache.hadoop.security.token.Token;
+import org.apache.hadoop.security.token.TokenIdentifier;
+import org.apache.ratis.thirdparty.com.google.protobuf.ByteString;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CompletionException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+
+/**
+ * An {@link OutputStream} used by the REST service in combination with the
+ * SCMClient to write the value of a key to a sequence
+ * of container chunks.  Writes are buffered locally and periodically written to
+ * the container as a new chunk.  In order to preserve the semantics that
+ * replacement of a pre-existing key is atomic, each instance of the stream has
+ * an internal unique identifier.  This unique identifier and a monotonically
+ * increasing chunk index form a composite key that is used as the chunk name.
+ * After all data is written, a putKey call creates or updates the corresponding
+ * container key, and this call includes the full list of chunks that make up
+ * the key data.  The list of chunks is updated all at once.  Therefore, a
+ * concurrent reader never can see an intermediate state in which different
+ * chunks of data from different versions of the key data are interleaved.
+ * This class encapsulates all state management for buffering and writing
+ * through to the container.
+ */
+public class RatisBlockOutputStream extends BlockOutputStream {
+  public static final Logger LOG = LoggerFactory.getLogger(
+      RatisBlockOutputStream.class);
+
+  private XceiverClientFactory xceiverClientFactory;
+  private XceiverClientSpi xceiverClient;
+
+  // This object will maintain the commitIndexes and byteBufferList in order
+  // Also, corresponding to the logIndex, the corresponding list of buffers will
+  // be released from the buffer pool.
+  private final CommitWatcher commitWatcher;
+  private final List<DatanodeDetails> failedServers = new ArrayList<>(0);
+
+  /**
+   * Creates a new BlockOutputStream.
+   *
+   * @param blockID              block ID
+   * @param xceiverClientManager client manager that controls client
+   * @param pipeline             pipeline where block will be written
+   * @param bufferPool           pool of buffers
+   */
+  public RatisBlockOutputStream(
+      Pipeline pipeline,
+      XceiverClientFactory xceiverClientManager,
+      BlockID blockID,
+      BufferPool bufferPool,
+      OzoneClientConfig config,
+      Token<? extends TokenIdentifier> token
+  ) throws IOException {
+    super(blockID, bufferPool, config, token);
+    this.xceiverClientFactory = xceiverClientManager;
+    this.xceiverClient = xceiverClientManager.acquireClient(pipeline);
+    this.commitWatcher = new CommitWatcher(bufferPool, xceiverClient);
+  }
+
+  @Override
+  public long getTotalAckDataLength() {
+    return commitWatcher.getTotalAckDataLength();
+  }
+
+  @VisibleForTesting
+  public XceiverClientSpi getXceiverClient() {
+    return xceiverClient;
+  }
+
+  @VisibleForTesting
+  public Map<Long, List<ChunkBuffer>> getCommitIndex2flushedDataMap() {
+    return commitWatcher.getCommitIndex2flushedDataMap();
+  }
+
+  @Override
+  void releaseBuffersOnException() {
+    commitWatcher.releaseBuffersOnException();
+  }
+
+  @Override
+  public List<DatanodeDetails> getFailedServers() {
+    return failedServers;
+  }
+
+  @Override
+  void watchForCommit(boolean bufferFull) throws IOException {
+    checkOpen();
+    try {
+      XceiverClientReply reply = bufferFull ?
+          commitWatcher.watchOnFirstIndex() : commitWatcher.watchOnLastIndex();
+      if (reply != null) {
+        List<DatanodeDetails> dnList = reply.getDatanodes();
+        if (!dnList.isEmpty()) {
+          Pipeline pipe = xceiverClient.getPipeline();
+
+          LOG.warn("Failed to commit BlockId {} on {}. Failed nodes: {}",
+              getBlockID(), pipe, dnList);
+          failedServers.addAll(dnList);
+        }
+      }
+    } catch (IOException ioe) {
+      setIoException(ioe);
+      throw getIoException();
+    }
+    refreshCurrentBuffer();
+  }
+
+  static class RatisPutBlockReply implements PutBlockReply {
+    private final XceiverClientReply reply;
+
+    RatisPutBlockReply(XceiverClientReply reply) {
+      this.reply = reply;
+    }
+
+    @Override
+    public CompletableFuture<ContainerCommandResponseProto> getResponse() {
+      return reply.getResponse();
+    }
+
+    long getLogIndex() {
+      return reply.getLogIndex();
+    }
+  }
+
+  @Override
+  PutBlockReply sendPutBlock(BlockData blockData, boolean close,
+      Token<? extends TokenIdentifier> token)
+      throws IOException, ExecutionException, InterruptedException {
+    final XceiverClientReply reply = ContainerProtocolCalls.putBlockAsync(
+        xceiverClient, blockData, close, token);
+    LOG.debug("Adding index {} commitMap size {}",
+        reply.getLogIndex(), commitWatcher.getCommitInfoMapSize());
+    return new RatisPutBlockReply(reply);
+  }
+
+  @Override
+  void updateCommitInfo(PutBlockReply reply, List<ChunkBuffer> byteBufferList) {
+    final long logIndex = ((RatisPutBlockReply)reply).getLogIndex();
+    commitWatcher.updateCommitInfoMap(logIndex, byteBufferList);
+  }
+
+  @Override
+  void putFlushFuture(long flushPos,
+      CompletableFuture<ContainerCommandResponseProto> flushFuture) {
+    commitWatcher.getFutureMap().put(flushPos, flushFuture);
+  }
+
+  @Override
+  void waitOnFlushFutures() throws InterruptedException, ExecutionException {
+    // wait for all the transactions to complete
+    CompletableFuture.allOf(commitWatcher.getFutureMap().values().toArray(
+        new CompletableFuture[0])).get();
+  }
+
+  @Override
+  void waitFullBuffer() throws ExecutionException, InterruptedException {
+    if (!commitWatcher.getFutureMap().isEmpty()) {
+      waitOnFlushFutures();
+    }
+  }
+
+  public void cleanup(boolean invalidateClient) {
+    if (xceiverClientFactory != null) {
+      xceiverClientFactory.releaseClient(xceiverClient, invalidateClient);
+    }
+    xceiverClientFactory = null;
+    xceiverClient = null;
+    commitWatcher.cleanup();
+    super.cleanup(invalidateClient);
+  }
+
+  public boolean isClosed() {
+    return xceiverClient == null;
+  }
+
+  @Override
+  boolean shouldFlush() {
+    return xceiverClientFactory != null && !isClosed();
+  }
+
+  /**
+   * Writes buffered data as a new chunk to the container and saves chunk
+   * information to be used later in putKey call.

Review comment:
       I think the second point is about adding the chunkInfo to containerBlockData. Since that part moved to the invoking method, we may need to either update this comment or move the addChunkInfo call here.
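   
   Something like this at the call site would keep the javadoc accurate (just a sketch; containerBlockData/addChunks as in the current BlockOutputStream):
   
   ```java
   private void writeChunk(ChunkBuffer buffer) throws IOException {
     if (bufferList == null) {
       bufferList = new ArrayList<>();
     }
     bufferList.add(buffer);
     final ChunkInfo chunk = writeChunkToContainer(
         buffer.duplicate(0, buffer.position()));
     // register the chunk right away, so "saves chunk information to be
     // used later in putKey call" stays true at this call site
     containerBlockData.addChunks(chunk);
   }
   ```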

##########
File path: hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/RatisBlockOutputStream.java
##########
@@ -0,0 +1,260 @@
+  /**
+   * Writes buffered data as a new chunk to the container and saves chunk
+   * information to be used later in putKey call.
+   *
+   * @throws IOException if there is an I/O error while performing the call
+   * @throws OzoneChecksumException if there is an error while computing
+   * checksum
+   */

Review comment:
       Do you think we have Ratis-specific logic here? What do you think about moving this to BlockOutputStream?
   The actual write depends on the xceiverClient anyway (whether it's Ratis or standalone).
   
   Even for EC we tried to use the same writeChunkToContainer API.
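   
   Roughly the shape I have in mind (illustrative only; buildChunkInfo and sendWriteChunk are placeholder names, not real APIs):
   
   ```java
   // in BlockOutputStream: the write path only needs an XceiverClientSpi,
   // so it is the same for Ratis and standalone pipelines
   ChunkInfo writeChunkToContainer(ChunkBuffer chunk) throws IOException {
     final ChunkInfo chunkInfo = buildChunkInfo(chunk);    // name, offset, checksum
     sendWriteChunk(getXceiverClient(), chunkInfo, chunk); // client picks the transport
     return chunkInfo;
   }
   ```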

##########
File path: hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/BlockOutputStream.java
##########
@@ -503,9 +453,16 @@ private void writeChunk(ChunkBuffer buffer)
       bufferList = new ArrayList<>();
     }
     bufferList.add(buffer);
-    writeChunkToContainer(buffer.duplicate(0, buffer.position()));
+    final ChunkBuffer duplicated = buffer.duplicate(0, buffer.position());
+    final ChunkInfo chunk = writeChunkToContainer(

Review comment:
       When we override the write API, the extended BOS needs to implement its own way of writing, but it still needs a way to add the chunkInfo to containerBlockData, since that is used in executePutBlock. Should we expose an abstract addChunks method to update the chunkInfo in the containerBlockData object?
   
   For example, for ECBOS we decided to skip the buffered writing, so we overrode the write API and used the writeChunkToContainer API to write directly to the container. There we also overrode the putBlock call and just added a getContainerBlockData API to get the structure.
   Please check if there is a better idea, though.
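   
   One possible shape for the hook (sketch only; simplified, not the exact PR code):
   
   ```java
   public abstract class BlockOutputStream extends OutputStream {
     private final BlockData.Builder containerBlockData = BlockData.newBuilder();
   
     /** Record a completed chunk so executePutBlock sees the full list. */
     protected void addChunkInfo(ChunkInfo chunk) {
       containerBlockData.addChunks(chunk);
     }
   }
   ```
   
   Then ECBOS's overridden write could call writeChunkToContainer and addChunkInfo directly, without going through the buffered path.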
   
   

##########
File path: hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/RatisBlockOutputStream.java
##########
@@ -0,0 +1,260 @@
+public class RatisBlockOutputStream extends BlockOutputStream {
+  private XceiverClientFactory xceiverClientFactory;
+  private XceiverClientSpi xceiverClient;

Review comment:
       I think we have XceiverClientRatis and XceiverClientGrpc.
   So currently either one can be set into RatisBlockOutputStream?
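   
   If this class is really meant for Ratis only, a guard in the constructor could make that explicit (sketch only; checking the pipeline's replication type is one option):
   
   ```java
   if (pipeline.getType() != HddsProtos.ReplicationType.RATIS) {
     throw new IllegalArgumentException(
         "RatisBlockOutputStream requires a RATIS pipeline, got "
             + pipeline.getType());
   }
   ```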




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@ozone.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


