Posted to commits@pulsar.apache.org by GitBox <gi...@apache.org> on 2022/07/18 03:00:54 UTC

[GitHub] [pulsar] poorbarcode commented on a diff in pull request #16428: [improve] [txn] [PIP-160] Txn buffered writer for transaction log batch

poorbarcode commented on code in PR #16428:
URL: https://github.com/apache/pulsar/pull/16428#discussion_r922944443


##########
pulsar-transaction/coordinator/src/test/java/org/apache/pulsar/transaction/coordinator/impl/TxnLogBufferedWriterTest.java:
##########
@@ -0,0 +1,502 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pulsar.transaction.coordinator.impl;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import io.netty.buffer.ByteBuf;
+import io.netty.buffer.Unpooled;
+import io.netty.util.concurrent.DefaultThreadFactory;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import lombok.Getter;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.bookkeeper.client.BKException;
+import org.apache.bookkeeper.common.util.OrderedExecutor;
+import org.apache.bookkeeper.mledger.AsyncCallbacks;
+import org.apache.bookkeeper.mledger.Entry;
+import org.apache.bookkeeper.mledger.ManagedCursor;
+import org.apache.bookkeeper.mledger.ManagedLedger;
+import org.apache.bookkeeper.mledger.ManagedLedgerException;
+import org.apache.bookkeeper.mledger.Position;
+import org.apache.bookkeeper.mledger.impl.PositionImpl;
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.pulsar.metadata.api.MetadataStoreException;
+import org.apache.pulsar.transaction.coordinator.test.MockedBookKeeperTestCase;
+import org.awaitility.Awaitility;
+import org.mockito.Mockito;
+import org.mockito.invocation.InvocationOnMock;
+import org.mockito.stubbing.Answer;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+@Slf4j
+public class TxnLogBufferedWriterTest extends MockedBookKeeperTestCase {

Review Comment:
   > Please also create a test for the writer close behavior: check the state of the writer, and make sure that new operations will fail and that pending operations (not yet flushed) will also fail.
   
   Yes, it is covered by data provider case 5-1.
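   
   For reference, a minimal sketch of what such a close test can look like (the createWriter() helper and the inline callback below are hypothetical, only to illustrate the expected behavior; the real coverage is the data provider case):
   
       @Test
       public void testAddDataAfterCloseFails() throws Exception {
           TxnLogBufferedWriter<Integer> writer = createWriter(); // hypothetical helper
           AtomicBoolean addFailed = new AtomicBoolean(false);
           writer.close();
           // A new operation after close() must fail with the "closed" exception.
           writer.asyncAddData(1, new AddDataCallback() {
               @Override
               public void addComplete(Position position, Object ctx) {
                   // Must never be reached once the writer is closed.
               }
               @Override
               public void addFailed(ManagedLedgerException exception, Object ctx) {
                   addFailed.set(true);
               }
           }, null);
           Awaitility.await().untilAsserted(() -> Assert.assertTrue(addFailed.get()));
       }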



##########
pulsar-transaction/coordinator/src/main/java/org/apache/pulsar/transaction/coordinator/impl/TxnLogBufferedWriter.java:
##########
@@ -0,0 +1,549 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pulsar.transaction.coordinator.impl;
+
+import io.netty.buffer.ByteBuf;
+import io.netty.buffer.Unpooled;
+import io.netty.util.Recycler;
+import java.io.Closeable;
+import java.util.ArrayList;
+import java.util.UUID;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
+import lombok.Getter;
+import lombok.ToString;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.bookkeeper.common.util.OrderedExecutor;
+import org.apache.bookkeeper.mledger.AsyncCallbacks;
+import org.apache.bookkeeper.mledger.ManagedLedger;
+import org.apache.bookkeeper.mledger.ManagedLedgerException;
+import org.apache.bookkeeper.mledger.Position;
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.pulsar.common.allocator.PulsarByteBufAllocator;
+import org.apache.pulsar.common.util.collections.BitSetRecyclable;
+
+/***
+ * See PIP-160: https://github.com/apache/pulsar/issues/15516.
+ * Buffers requests and flushes them to the Managed Ledger. The Transaction Log Store and Pending Ack Store no longer
+ * write to the Managed Ledger directly; they use this class to write ledger data instead.
+ * Caches “write requests” until a certain count or total size of request data is reached, then writes them to the
+ * Managed Ledger in one go. After the Managed Ledger write completes, each request-caller receives a response. In
+ * this process, the Managed Ledger doesn't care how many records the entry contains or what is written; it treats
+ * them as a single block of data.
+ * The first write-request by transaction components would otherwise take a long time to receive a response, because
+ * we must wait for subsequent requests to accumulate enough data before actually writing to the Managed Ledger. To
+ * bound the maximum latency, we record the time of the first request in each batch, and a timer additionally
+ * triggers writes.
+ * The batch feature can be enabled or disabled; when disabled, writes go to the Managed Ledger directly, without
+ * batching.
+ */
+@Slf4j
+public class TxnLogBufferedWriter<T> implements AsyncCallbacks.AddEntryCallback, Closeable {
+
+    public static final short BATCHED_ENTRY_DATA_PREFIX_MAGIC_NUMBER = 0x0e01;
+
+    public static final short BATCHED_ENTRY_DATA_PREFIX_VERSION = 1;
+
+    private static final ManagedLedgerException BUFFERED_WRITER_CLOSED_EXCEPTION =
+            new ManagedLedgerException.ManagedLedgerFencedException(
+                    new Exception("Transaction log buffered writer has closed")
+            );
+
+    /**
+     * Whether the batch feature is enabled. When disabled, writes go to the Managed Ledger directly, without batching.
+     */
+    private final boolean batchEnabled;
+
+    private final ManagedLedger managedLedger;
+
+    /** All write operations will be executed on a single thread. **/
+    private final ExecutorService singleThreadExecutorForWrite;
+
+    /** The serializer for objects passed to {@link #asyncAddData}. **/
+    private final DataSerializer<T> dataSerializer;
+
+    private ScheduledFuture<?> scheduledFuture;
+
+    /**
+     * Maximum number of cached “write requests”; when this threshold is reached, a Bookie write is triggered.
+     */
+    private final int batchedWriteMaxRecords;
+
+    /**
+     * Maximum total size of cached request data; when this threshold is reached, a Bookie write is triggered.
+     */
+    private final int batchedWriteMaxSize;
+
+    /** Maximum delay before the earliest request in the batch is written to Bookie. **/
+    private final int batchedWriteMaxDelayInMillis;
+
+    /** Data cached in the current batch. Cleared after each batched write. **/
+    private final ArrayList<T> dataArray;
+
+    /**
+     * Parameters of {@link #asyncAddData} cached in the current batch. Replaced with a new instance after each
+     * batched write.
+     */
+    private FlushContext flushContext;
+
+    /** Byte size of the data in the current batch. Reset to 0 after each batched write. **/
+    private long bytesSize;
+
+    /** The main purpose of state maintenance is to prevent writes after close. **/
+    private volatile State state;
+    private static final AtomicReferenceFieldUpdater<TxnLogBufferedWriter, TxnLogBufferedWriter.State> STATE_UPDATER =
+            AtomicReferenceFieldUpdater
+                    .newUpdater(TxnLogBufferedWriter.class, TxnLogBufferedWriter.State.class, "state");
+
+
+    /**
+     * Constructor.
+     * @param dataSerializer The serializer for objects passed to {@link #asyncAddData}.
+     * @param batchedWriteMaxRecords Maximum number of cached “write requests”; when this threshold is reached, a
+     *                               Bookie write is triggered.
+     * @param batchedWriteMaxSize Maximum total size of cached request data; when this threshold is reached, a Bookie
+     *                           write is triggered.
+     * @param batchedWriteMaxDelayInMillis Maximum delay before the earliest request in the batch is written to Bookie.
+     * @param batchEnabled Whether to enable the batch feature; when disabled, writes go to the Managed Ledger
+     *                    directly, without batching.
+     */
+    public TxnLogBufferedWriter(ManagedLedger managedLedger, OrderedExecutor orderedExecutor,
+                                ScheduledExecutorService scheduledExecutorService, DataSerializer<T> dataSerializer,
+                                int batchedWriteMaxRecords, int batchedWriteMaxSize, int batchedWriteMaxDelayInMillis,
+                                boolean batchEnabled){
+        this.batchEnabled = batchEnabled;
+        this.managedLedger = managedLedger;
+        this.singleThreadExecutorForWrite = orderedExecutor.chooseThread(
+                managedLedger.getName() == null ? UUID.randomUUID().toString() : managedLedger.getName());
+        this.dataSerializer = dataSerializer;
+        this.batchedWriteMaxRecords = batchedWriteMaxRecords;
+        this.batchedWriteMaxSize = batchedWriteMaxSize;
+        this.batchedWriteMaxDelayInMillis = batchedWriteMaxDelayInMillis;
+        this.flushContext = FlushContext.newInstance();
+        this.dataArray = new ArrayList<>();
+        // Schedule the periodic flush task.
+        if (batchEnabled) {
+            this.scheduledFuture = scheduledExecutorService.scheduleAtFixedRate(() -> trigFlush(false),
+                    batchedWriteMaxDelayInMillis, batchedWriteMaxDelayInMillis, TimeUnit.MILLISECONDS);
+        }
+        this.state = State.OPEN;
+    }
+
+    /**
+     * Append a new entry to the end of the managed ledger. All writes are performed on the same thread. Callbacks
+     * are executed in strict write order, but after {@link #close()}, callbacks that fail the state check will
+     * execute earlier; successful callbacks are not affected.
+     * @param data data entry to be persisted.
+     * @param callback {@link AddDataCallback#addComplete(Position, Object)} is called when the add completes,
+     *                 {@link AddDataCallback#addFailed(ManagedLedgerException, Object)} when it fails.
+     */
+    public void asyncAddData(T data, AddDataCallback callback, Object ctx){
+        if (!batchEnabled){
+            if (state == State.CLOSING || state == State.CLOSED){
+                callback.addFailed(BUFFERED_WRITER_CLOSED_EXCEPTION, ctx);
+                return;
+            }
+            ByteBuf byteBuf = dataSerializer.serialize(data);
+            managedLedger.asyncAddEntry(byteBuf, DisabledBatchCallback.INSTANCE,
+                    AsyncAddArgs.newInstance(callback, ctx, System.currentTimeMillis(), byteBuf));
+            return;
+        }
+        singleThreadExecutorForWrite.execute(() -> internalAsyncAddData(data, callback, ctx));
+    }
+
+    private void internalAsyncAddData(T data, AddDataCallback callback, Object ctx){
+        if (state == State.CLOSING || state == State.CLOSED){
+            callback.addFailed(BUFFERED_WRITER_CLOSED_EXCEPTION, ctx);
+            return;
+        }
+        int len = dataSerializer.getSerializedSize(data);
+        if (len >= batchedWriteMaxSize){
+            if (!flushContext.asyncAddArgsList.isEmpty()) {
+                doTrigFlush(true);
+            }
+            ByteBuf byteBuf = dataSerializer.serialize(data);
+            managedLedger.asyncAddEntry(byteBuf, DisabledBatchCallback.INSTANCE,
+                    AsyncAddArgs.newInstance(callback, ctx, System.currentTimeMillis(), byteBuf));
+            return;
+        }
+        // Add data.
+        this.dataArray.add(data);
+        // Add callback info.
+        AsyncAddArgs asyncAddArgs = AsyncAddArgs.newInstance(callback, ctx, System.currentTimeMillis());
+        this.flushContext.asyncAddArgsList.add(asyncAddArgs);
+        // Calculate bytes-size.
+        this.bytesSize += len;
+        // Trigger a flush if any threshold has been reached.
+        doTrigFlush(false);
+    }
+
+    /***
+     * The serializer for objects passed to {@link #asyncAddData}.
+     */
+    public interface DataSerializer<T>{
+
+        /**
+         * Calculate the number of bytes {@param data} will occupy after serialization.
+         * @param data The object passed to {@link #asyncAddData}.
+         * @return The number of bytes occupied after serialization.
+         */
+        int getSerializedSize(T data);
+
+        /**
+         * Serialize {@param data} to a {@link ByteBuf}. The returned ByteBuf will be released once the write to
+         * Bookie completes; if you still need to use the ByteBuf afterwards, call {@link ByteBuf#retain()} in the
+         * {@link #serialize(Object)} implementation.
+         * @param data The object passed to {@link #asyncAddData}.
+         * @return byte buf.
+         */
+        ByteBuf serialize(T data);
+
+        /**
+         * Serialize {@param dataArray} to a {@link ByteBuf}. The returned ByteBuf will be released once the write to
+         * Bookie completes; if you still need to use the ByteBuf afterwards, call {@link ByteBuf#retain()} in the
+         * {@link #serialize(ArrayList)} implementation.
+         * @param dataArray The objects passed to {@link #asyncAddData}.
+         * @return byte buf.
+         */
+        ByteBuf serialize(ArrayList<T> dataArray);
+
+    }
+
+    /**
+     * Trigger one write to Bookie. If the flush conditions are not met, nothing is done.
+     */
+    public void trigFlush(final boolean force){
+        singleThreadExecutorForWrite.execute(() -> doTrigFlush(force));
+    }
+
+    private void doTrigFlush(boolean force){
+        if (flushContext.asyncAddArgsList.isEmpty()) {
+            return;
+        }
+        if (force){
+            doFlush();
+            return;
+        }
+        AsyncAddArgs firstAsyncAddArgs = flushContext.asyncAddArgsList.get(0);
+        if (System.currentTimeMillis() - firstAsyncAddArgs.addedTime > batchedWriteMaxDelayInMillis){
+            doFlush();
+            return;
+        }
+        if (this.flushContext.asyncAddArgsList.size() >= batchedWriteMaxRecords){
+            doFlush();
+            return;
+        }
+        if (this.bytesSize >= batchedWriteMaxSize){
+            doFlush();
+        }
+    }
+
+    private void doFlush(){
+        // Combine data.
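+        // Resulting entry layout: [2-byte magic][2-byte version][serialized batch payload].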
+        ByteBuf prefix = PulsarByteBufAllocator.DEFAULT.buffer(4);
+        prefix.writeShort(BATCHED_ENTRY_DATA_PREFIX_MAGIC_NUMBER);
+        prefix.writeShort(BATCHED_ENTRY_DATA_PREFIX_VERSION);
+        ByteBuf actualContent = this.dataSerializer.serialize(this.dataArray);
+        ByteBuf pairByteBuf = Unpooled.wrappedUnmodifiableBuffer(prefix, actualContent);
+        // We need to release this pairByteBuf after the Managed Ledger async-add callback. It is held by the FlushContext.
+        this.flushContext.byteBuf = pairByteBuf;
+        // Flush.
+        if (State.CLOSING == state || State.CLOSED == state){
+            failureCallbackByContextAndRecycle(flushContext, BUFFERED_WRITER_CLOSED_EXCEPTION);
+        } else {
+            managedLedger.asyncAddEntry(pairByteBuf, this, this.flushContext);
+        }
+        // Clear buffers.
+        this.dataArray.clear();
+        this.flushContext = FlushContext.newInstance();
+        this.bytesSize = 0;
+    }
+
+    /**
+     * see {@link AsyncCallbacks.AddEntryCallback#addComplete(Position, ByteBuf, Object)}.
+     */
+    @Override
+    public void addComplete(Position position, ByteBuf entryData, Object ctx) {
+        final FlushContext flushContext = (FlushContext) ctx;
+        try {
+            final int batchSize = flushContext.asyncAddArgsList.size();
+            for (int batchIndex = 0; batchIndex < batchSize; batchIndex++) {
+                final AsyncAddArgs asyncAddArgs = flushContext.asyncAddArgsList.get(batchIndex);
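+                // Build an ackSet whose only set bit marks this record's index within the batched entry.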
+                BitSetRecyclable bitSetRecyclable = BitSetRecyclable.create();
+                bitSetRecyclable.set(batchIndex);
+                long[] ackSet = bitSetRecyclable.toLongArray();
+                bitSetRecyclable.recycle();
+                final TxnBatchedPositionImpl txnBatchedPosition = new TxnBatchedPositionImpl(position, batchSize,
+                        batchIndex, ackSet);
+                // This task is already running on the ordered executor, so just invoke the callback directly.
+                try {
+                    asyncAddArgs.callback.addComplete(txnBatchedPosition, asyncAddArgs.ctx);
+                } catch (Exception e){
+                    log.error("After writing to the transaction batched log complete, the callback failed."
+                            + " managedLedger: " + managedLedger.getName(), e);
+                }
+            }
+        } finally {
+            flushContext.recycle();
+        }
+    }
+
+    /**
+     * see {@link AsyncCallbacks.AddEntryCallback#addFailed(ManagedLedgerException, Object)}.
+     */
+    @Override
+    public void addFailed(ManagedLedgerException exception, Object ctx) {
+        final FlushContext flushContext = (FlushContext) ctx;
+        failureCallbackByContextAndRecycle(flushContext, exception);
+    }
+
+    /**
+     * Cancel pending tasks and release resources.
+     */
+    @Override
+    public void close() {
+        // If the batch feature is disabled, there is no CLOSING state.
+        if (!batchEnabled) {
+            STATE_UPDATER.compareAndSet(this, State.OPEN, State.CLOSED);
+            return;
+        }
+        // Prevent reentrance.
+        if (!STATE_UPDATER.compareAndSet(this, State.OPEN, State.CLOSING)){
+            // Another thread is also calling close().
+            return;
+        }
+        // Cancel pending tasks and release resources.
+        singleThreadExecutorForWrite.execute(() -> {
+            if (state == State.CLOSED){
+                return;
+            }
+            // Fail the callbacks of the pending requests.
+            // For requests that have already been flushed, Bookie triggers the callback.
+            failureCallbackByContextAndRecycle(this.flushContext, BUFFERED_WRITER_CLOSED_EXCEPTION);
+            // Cancel the fixed-rate flush task.
+            if (scheduledFuture != null && !scheduledFuture.isCancelled() && !scheduledFuture.isDone()
+                    && !this.scheduledFuture.cancel(false)) {
+                // Cancelling the task failed; the state will stay at CLOSING.
+                log.error("Failed to cancel the scheduled flush task; the state will stay at CLOSING."
+                        + " managedLedger: " + managedLedger.getName());
+                return;
+            }
+            this.state = State.CLOSED;
+        });

Review Comment:
   Already fixed.
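   
   For illustration, a rough usage sketch of the buffered writer API from the diff above (the int-based DataSerializer and the parameter values are made-up examples, not the transaction log's real serialization format):
   
       TxnLogBufferedWriter.DataSerializer<Integer> serializer =
               new TxnLogBufferedWriter.DataSerializer<Integer>() {
           @Override
           public int getSerializedSize(Integer data) {
               return 4; // a fixed-size payload keeps the example simple
           }
           @Override
           public ByteBuf serialize(Integer data) {
               return Unpooled.buffer(4).writeInt(data);
           }
           @Override
           public ByteBuf serialize(ArrayList<Integer> dataArray) {
               ByteBuf buf = Unpooled.buffer(4 * dataArray.size());
               dataArray.forEach(buf::writeInt);
               return buf;
           }
       };
       TxnLogBufferedWriter<Integer> writer = new TxnLogBufferedWriter<>(
               managedLedger, orderedExecutor, scheduledExecutorService, serializer,
               512 /* batchedWriteMaxRecords */, 1024 * 1024 /* batchedWriteMaxSize */,
               1 /* batchedWriteMaxDelayInMillis */, true /* batchEnabled */);
       writer.asyncAddData(1, callback, ctx); // the callback fires after the batch is flushed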



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pulsar.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org