You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@flink.apache.org by GitBox <gi...@apache.org> on 2022/07/07 16:29:33 UTC

[GitHub] [flink] rkhachatryan commented on a diff in pull request #20152: [FLINK-27155][changelog] Reduce multiple reads to the same Changelog …

rkhachatryan commented on code in PR #20152:
URL: https://github.com/apache/flink/pull/20152#discussion_r916034348


##########
flink-dstl/flink-dstl-dfs/src/main/java/org/apache/flink/changelog/fs/StateChangeIteratorWithCache.java:
##########
@@ -0,0 +1,367 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.changelog.fs;
+
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ConfigurationUtils;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.core.memory.DataInputViewStreamWrapper;
+import org.apache.flink.runtime.state.StreamStateHandle;
+import org.apache.flink.runtime.state.changelog.StateChange;
+import org.apache.flink.runtime.state.filesystem.FileStateHandle;
+import org.apache.flink.util.CloseableIterator;
+import org.apache.flink.util.IOUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.apache.flink.configuration.StateChangelogOptions.PERIODIC_MATERIALIZATION_INTERVAL;
+
+/** StateChangeIterator with local cache. */
+class StateChangeIteratorWithCache extends StateChangeIteratorImpl {

Review Comment:
   This class currently has two responsibilities: 1. Caching; 2. Iteration
   
   This leads to code duplication and higher complexity IMO.
   
   Can it only be responsible for caching? I.e., for creating an `FSDataInputStream` from a `StreamStateHandle`?
   Then, we'd have a trivial implementation `handle -> handle.openInputStream` and a caching one.
   And then it could be injected into the iterator in `FsStateChangelogStorageForRecovery.createReader`.
   
   WDYT?



##########
flink-dstl/flink-dstl-dfs/src/main/java/org/apache/flink/changelog/fs/StateChangeIteratorWithCache.java:
##########
@@ -0,0 +1,367 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.changelog.fs;
+
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ConfigurationUtils;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.core.memory.DataInputViewStreamWrapper;
+import org.apache.flink.runtime.state.StreamStateHandle;
+import org.apache.flink.runtime.state.changelog.StateChange;
+import org.apache.flink.runtime.state.filesystem.FileStateHandle;
+import org.apache.flink.util.CloseableIterator;
+import org.apache.flink.util.IOUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.apache.flink.configuration.StateChangelogOptions.PERIODIC_MATERIALIZATION_INTERVAL;
+
+/** StateChangeIterator with local cache. */
+class StateChangeIteratorWithCache extends StateChangeIteratorImpl {
+    private static final Logger LOG = LoggerFactory.getLogger(StateChangeIteratorWithCache.class);
+
+    private static final String CACHE_FILE_PREFIX = "dstl-";
+
+    private final File cacheDir;
+    private final ConcurrentMap<Path, FileCache> cache = new ConcurrentHashMap<>();
+    private final ScheduledExecutorService cacheCleanScheduler;
+    private final ExecutorService downloadExecutor;
+    private final long cacheIdleMillis;
+
+    StateChangeIteratorWithCache(ExecutorService downloadExecutor, Configuration config) {
+        // TODO: 2022/5/31 add a new options for cache idle
+        long cacheIdleMillis = config.get(PERIODIC_MATERIALIZATION_INTERVAL).toMillis();
+        File cacheDir = ConfigurationUtils.getRandomTempDirectory(config);
+
+        this.cacheCleanScheduler =
+                SchedulerFactory.create(1, "ChangelogCacheFileCleanScheduler", LOG);
+        this.downloadExecutor = downloadExecutor;
+        this.cacheIdleMillis = cacheIdleMillis;
+        this.cacheDir = cacheDir;
+    }
+
+    @Override
+    public CloseableIterator<StateChange> read(StreamStateHandle handle, long offset)
+            throws IOException {
+
+        if (!(handle instanceof FileStateHandle)) {
+            return new StateChangeFormat().read(wrapAndSeek(handle.openInputStream(), offset));
+        }
+
+        FileStateHandle fileHandle = (FileStateHandle) handle;
+        DataInputStream input;
+
+        if (fileHandle.getFilePath().getFileSystem().isDistributedFS()) {
+
+            Path dfsPath = fileHandle.getFilePath();
+            FileCache fileCache =
+                    cache.computeIfAbsent(
+                            dfsPath,
+                            key -> {
+                                FileCache fCache = new FileCache(cacheDir);
+                                downloadExecutor.execute(() -> downloadFile(fileHandle, fCache));
+                                return fCache;
+                            });
+
+            FileInputStream fin = fileCache.openAndSeek(offset);
+
+            input =
+                    new DataInputStream(new BufferedInputStream(fin)) {
+                        @Override
+                        public void close() throws IOException {
+                            super.close();
+                            if (fileCache.getRefCount() == 0) {
+                                cacheCleanScheduler.schedule(
+                                        () -> cleanFileCache(dfsPath, fileCache),
+                                        cacheIdleMillis,
+                                        TimeUnit.MILLISECONDS);
+                            }
+                        }
+                    };
+        } else {
+            input = wrapAndSeek(handle.openInputStream(), offset);
+        }
+
+        return new StateChangeFormat().read(input);
+    }
+
+    private DataInputViewStreamWrapper wrapAndSeek(InputStream stream, long offset)
+            throws IOException {
+        DataInputViewStreamWrapper wrappedStream = wrap(stream);
+        if (offset != 0) {
+            LOG.debug("seek to {}", offset);
+            wrappedStream.skipBytesToRead((int) offset);
+        }
+        return wrappedStream;
+    }
+
+    private void downloadFile(FileStateHandle handle, FileCache fileCache) {
+        try {
+            IOUtils.copyBytes(
+                    wrap(handle.openInputStream()), fileCache.getOutputStreamForSaveCacheData());
+            LOG.debug(
+                    "download and decompress dstl file : {} to local cache file : {}",
+                    handle.getFilePath(),
+                    fileCache.getFilePath());
+
+        } catch (IOException e) {
+            fileCache.setSaveCacheDataException(e);
+        }
+    }
+
+    private void cleanFileCache(Path dfsPath, FileCache fileCache) {
+        if (fileCache.getRefCount() == 0) {
+            LOG.debug("clean local cache file : {}", fileCache.getFilePath());
+            cache.remove(dfsPath);
+            fileCache.discard();
+        }
+    }
+
+    static class FileCache {
+
+        private final File cacheDir;
+        private final AtomicLong writeInBytes;
+        private final AtomicBoolean writeComplete;
+        private final AtomicInteger refCount;
+        private final CountDownLatch readLatch;
+
+        private volatile File file;
+        private volatile FileOutputStream fo;
+        private volatile Exception saveCacheDataException;
+
+        FileCache(File cacheDir) {
+            this.cacheDir = cacheDir;
+            this.writeInBytes = new AtomicLong(0);
+            this.writeComplete = new AtomicBoolean(false);
+            this.refCount = new AtomicInteger(0);
+            this.readLatch = new CountDownLatch(1);
+        }
+
+        String getFilePath() {
+            return this.file.getAbsolutePath();
+        }
+
+        OutputStream getOutputStreamForSaveCacheData() throws IOException {
+            synchronized (this) {
+                if (fo == null) {
+                    file = File.createTempFile(CACHE_FILE_PREFIX, null, cacheDir);
+                    fo = new FileOutputStream(file);
+                    readLatch.countDown();
+                } else {
+                    throw new IllegalStateException("only can get OutputStream once !");
+                }
+            }
+
+            return new OutputStream() {
+                @Override
+                public void write(int b) throws IOException {
+                    fo.write(b);
+                    writeInBytes.incrementAndGet();
+                }
+
+                @Override
+                public void write(byte[] b, int off, int len) throws IOException {
+                    fo.write(b, off, len);
+                    writeInBytes.addAndGet(len);
+                }
+
+                @Override
+                public void close() throws IOException {
+                    fo.close();
+                    writeComplete.set(true);
+                }
+            };
+        }
+
+        void setSaveCacheDataException(Exception e) {
+            this.saveCacheDataException = e;
+        }
+
+        int getRefCount() {
+            return refCount.get();
+        }
+
+        private void handoverException() throws IOException {
+            if (saveCacheDataException != null) {
+                throw new IOException(
+                        "there is a exception when save data to cache file : ",
+                        saveCacheDataException);
+            }
+        }
+
+        FileInputStream open() throws IOException {
+            return open0();
+        }
+
+        FileInputStream openAndSeek(long offset) throws IOException {
+            FileInputStream fin = open0();
+            if (offset != 0) {
+                LOG.debug("seek to {}", offset);
+                fin.getChannel().position(offset);
+            }
+            return fin;
+        }
+
+        private FileInputStream open0() throws IOException {
+            try {
+                readLatch.await();
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+
+            refCount.incrementAndGet();
+
+            return new FileInputStream(file) {

Review Comment:
   This creates a new file descriptor and potentially calls `fopen` for the same file — once for every offset in it.
   Can we reuse the stream, at least within the same thread?
   Or maybe it would be simpler to serialize all accesses to the same file in this PR? It seems like that would change the solution substantially, e.g. by removing the need for all the concurrency handling in `FileCache`.



##########
flink-dstl/flink-dstl-dfs/src/main/java/org/apache/flink/changelog/fs/StateChangeIteratorWithCache.java:
##########
@@ -0,0 +1,367 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.changelog.fs;
+
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ConfigurationUtils;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.core.memory.DataInputViewStreamWrapper;
+import org.apache.flink.runtime.state.StreamStateHandle;
+import org.apache.flink.runtime.state.changelog.StateChange;
+import org.apache.flink.runtime.state.filesystem.FileStateHandle;
+import org.apache.flink.util.CloseableIterator;
+import org.apache.flink.util.IOUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.apache.flink.configuration.StateChangelogOptions.PERIODIC_MATERIALIZATION_INTERVAL;
+
+/** StateChangeIterator with local cache. */
+class StateChangeIteratorWithCache extends StateChangeIteratorImpl {
+    private static final Logger LOG = LoggerFactory.getLogger(StateChangeIteratorWithCache.class);
+
+    private static final String CACHE_FILE_PREFIX = "dstl-";
+
+    private final File cacheDir;
+    private final ConcurrentMap<Path, FileCache> cache = new ConcurrentHashMap<>();
+    private final ScheduledExecutorService cacheCleanScheduler;
+    private final ExecutorService downloadExecutor;
+    private final long cacheIdleMillis;
+
+    StateChangeIteratorWithCache(ExecutorService downloadExecutor, Configuration config) {
+        // TODO: 2022/5/31 add a new options for cache idle
+        long cacheIdleMillis = config.get(PERIODIC_MATERIALIZATION_INTERVAL).toMillis();
+        File cacheDir = ConfigurationUtils.getRandomTempDirectory(config);
+
+        this.cacheCleanScheduler =
+                SchedulerFactory.create(1, "ChangelogCacheFileCleanScheduler", LOG);
+        this.downloadExecutor = downloadExecutor;
+        this.cacheIdleMillis = cacheIdleMillis;
+        this.cacheDir = cacheDir;
+    }
+
+    @Override
+    public CloseableIterator<StateChange> read(StreamStateHandle handle, long offset)
+            throws IOException {
+
+        if (!(handle instanceof FileStateHandle)) {
+            return new StateChangeFormat().read(wrapAndSeek(handle.openInputStream(), offset));
+        }
+
+        FileStateHandle fileHandle = (FileStateHandle) handle;
+        DataInputStream input;
+
+        if (fileHandle.getFilePath().getFileSystem().isDistributedFS()) {
+
+            Path dfsPath = fileHandle.getFilePath();
+            FileCache fileCache =
+                    cache.computeIfAbsent(
+                            dfsPath,
+                            key -> {
+                                FileCache fCache = new FileCache(cacheDir);
+                                downloadExecutor.execute(() -> downloadFile(fileHandle, fCache));
+                                return fCache;
+                            });
+
+            FileInputStream fin = fileCache.openAndSeek(offset);
+
+            input =
+                    new DataInputStream(new BufferedInputStream(fin)) {
+                        @Override
+                        public void close() throws IOException {
+                            super.close();

Review Comment:
   Wrap with `try/finally`?



##########
flink-dstl/flink-dstl-dfs/src/main/java/org/apache/flink/changelog/fs/StateChangeIteratorWithCache.java:
##########
@@ -0,0 +1,367 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.changelog.fs;
+
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ConfigurationUtils;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.core.memory.DataInputViewStreamWrapper;
+import org.apache.flink.runtime.state.StreamStateHandle;
+import org.apache.flink.runtime.state.changelog.StateChange;
+import org.apache.flink.runtime.state.filesystem.FileStateHandle;
+import org.apache.flink.util.CloseableIterator;
+import org.apache.flink.util.IOUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.apache.flink.configuration.StateChangelogOptions.PERIODIC_MATERIALIZATION_INTERVAL;
+
+/** StateChangeIterator with local cache. */
+class StateChangeIteratorWithCache extends StateChangeIteratorImpl {
+    private static final Logger LOG = LoggerFactory.getLogger(StateChangeIteratorWithCache.class);
+
+    private static final String CACHE_FILE_PREFIX = "dstl-";
+
+    private final File cacheDir;
+    private final ConcurrentMap<Path, FileCache> cache = new ConcurrentHashMap<>();
+    private final ScheduledExecutorService cacheCleanScheduler;
+    private final ExecutorService downloadExecutor;
+    private final long cacheIdleMillis;
+
+    StateChangeIteratorWithCache(ExecutorService downloadExecutor, Configuration config) {
+        // TODO: 2022/5/31 add a new options for cache idle
+        long cacheIdleMillis = config.get(PERIODIC_MATERIALIZATION_INTERVAL).toMillis();
+        File cacheDir = ConfigurationUtils.getRandomTempDirectory(config);
+
+        this.cacheCleanScheduler =
+                SchedulerFactory.create(1, "ChangelogCacheFileCleanScheduler", LOG);
+        this.downloadExecutor = downloadExecutor;
+        this.cacheIdleMillis = cacheIdleMillis;
+        this.cacheDir = cacheDir;
+    }
+
+    @Override
+    public CloseableIterator<StateChange> read(StreamStateHandle handle, long offset)
+            throws IOException {
+
+        if (!(handle instanceof FileStateHandle)) {
+            return new StateChangeFormat().read(wrapAndSeek(handle.openInputStream(), offset));
+        }
+
+        FileStateHandle fileHandle = (FileStateHandle) handle;
+        DataInputStream input;
+
+        if (fileHandle.getFilePath().getFileSystem().isDistributedFS()) {
+
+            Path dfsPath = fileHandle.getFilePath();
+            FileCache fileCache =
+                    cache.computeIfAbsent(
+                            dfsPath,
+                            key -> {
+                                FileCache fCache = new FileCache(cacheDir);
+                                downloadExecutor.execute(() -> downloadFile(fileHandle, fCache));
+                                return fCache;
+                            });
+
+            FileInputStream fin = fileCache.openAndSeek(offset);
+
+            input =
+                    new DataInputStream(new BufferedInputStream(fin)) {
+                        @Override
+                        public void close() throws IOException {
+                            super.close();
+                            if (fileCache.getRefCount() == 0) {
+                                cacheCleanScheduler.schedule(
+                                        () -> cleanFileCache(dfsPath, fileCache),
+                                        cacheIdleMillis,
+                                        TimeUnit.MILLISECONDS);
+                            }
+                        }
+                    };
+        } else {
+            input = wrapAndSeek(handle.openInputStream(), offset);
+        }
+
+        return new StateChangeFormat().read(input);
+    }
+
+    private DataInputViewStreamWrapper wrapAndSeek(InputStream stream, long offset)
+            throws IOException {
+        DataInputViewStreamWrapper wrappedStream = wrap(stream);
+        if (offset != 0) {
+            LOG.debug("seek to {}", offset);
+            wrappedStream.skipBytesToRead((int) offset);
+        }
+        return wrappedStream;
+    }
+
+    private void downloadFile(FileStateHandle handle, FileCache fileCache) {
+        try {
+            IOUtils.copyBytes(
+                    wrap(handle.openInputStream()), fileCache.getOutputStreamForSaveCacheData());
+            LOG.debug(
+                    "download and decompress dstl file : {} to local cache file : {}",
+                    handle.getFilePath(),
+                    fileCache.getFilePath());
+
+        } catch (IOException e) {
+            fileCache.setSaveCacheDataException(e);
+        }
+    }
+
+    private void cleanFileCache(Path dfsPath, FileCache fileCache) {
+        if (fileCache.getRefCount() == 0) {
+            LOG.debug("clean local cache file : {}", fileCache.getFilePath());
+            cache.remove(dfsPath);
+            fileCache.discard();

Review Comment:
   I think there is a race condition here:
   - thread T1 checks the ref count and sees 0,
   - thread T2 increments the ref count and starts reading,
   - thread T1 removes the file from the map and discards it (while it is still being read by T2).



##########
flink-dstl/flink-dstl-dfs/src/main/java/org/apache/flink/changelog/fs/StateChangeIteratorWithCache.java:
##########
@@ -0,0 +1,367 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.changelog.fs;
+
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ConfigurationUtils;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.core.memory.DataInputViewStreamWrapper;
+import org.apache.flink.runtime.state.StreamStateHandle;
+import org.apache.flink.runtime.state.changelog.StateChange;
+import org.apache.flink.runtime.state.filesystem.FileStateHandle;
+import org.apache.flink.util.CloseableIterator;
+import org.apache.flink.util.IOUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.apache.flink.configuration.StateChangelogOptions.PERIODIC_MATERIALIZATION_INTERVAL;
+
+/** StateChangeIterator with local cache. */
+class StateChangeIteratorWithCache extends StateChangeIteratorImpl {
+    private static final Logger LOG = LoggerFactory.getLogger(StateChangeIteratorWithCache.class);
+
+    private static final String CACHE_FILE_PREFIX = "dstl-";
+
+    private final File cacheDir;
+    private final ConcurrentMap<Path, FileCache> cache = new ConcurrentHashMap<>();
+    private final ScheduledExecutorService cacheCleanScheduler;
+    private final ExecutorService downloadExecutor;
+    private final long cacheIdleMillis;
+
+    StateChangeIteratorWithCache(ExecutorService downloadExecutor, Configuration config) {
+        // TODO: 2022/5/31 add a new options for cache idle
+        long cacheIdleMillis = config.get(PERIODIC_MATERIALIZATION_INTERVAL).toMillis();
+        File cacheDir = ConfigurationUtils.getRandomTempDirectory(config);
+
+        this.cacheCleanScheduler =
+                SchedulerFactory.create(1, "ChangelogCacheFileCleanScheduler", LOG);
+        this.downloadExecutor = downloadExecutor;
+        this.cacheIdleMillis = cacheIdleMillis;
+        this.cacheDir = cacheDir;
+    }
+
+    @Override
+    public CloseableIterator<StateChange> read(StreamStateHandle handle, long offset)
+            throws IOException {
+
+        if (!(handle instanceof FileStateHandle)) {
+            return new StateChangeFormat().read(wrapAndSeek(handle.openInputStream(), offset));
+        }
+
+        FileStateHandle fileHandle = (FileStateHandle) handle;
+        DataInputStream input;
+
+        if (fileHandle.getFilePath().getFileSystem().isDistributedFS()) {
+
+            Path dfsPath = fileHandle.getFilePath();
+            FileCache fileCache =
+                    cache.computeIfAbsent(
+                            dfsPath,
+                            key -> {
+                                FileCache fCache = new FileCache(cacheDir);
+                                downloadExecutor.execute(() -> downloadFile(fileHandle, fCache));
+                                return fCache;
+                            });
+
+            FileInputStream fin = fileCache.openAndSeek(offset);
+
+            input =
+                    new DataInputStream(new BufferedInputStream(fin)) {
+                        @Override
+                        public void close() throws IOException {
+                            super.close();
+                            if (fileCache.getRefCount() == 0) {
+                                cacheCleanScheduler.schedule(
+                                        () -> cleanFileCache(dfsPath, fileCache),
+                                        cacheIdleMillis,
+                                        TimeUnit.MILLISECONDS);
+                            }
+                        }
+                    };
+        } else {
+            input = wrapAndSeek(handle.openInputStream(), offset);
+        }
+
+        return new StateChangeFormat().read(input);
+    }
+
+    private DataInputViewStreamWrapper wrapAndSeek(InputStream stream, long offset)
+            throws IOException {
+        DataInputViewStreamWrapper wrappedStream = wrap(stream);
+        if (offset != 0) {
+            LOG.debug("seek to {}", offset);
+            wrappedStream.skipBytesToRead((int) offset);
+        }
+        return wrappedStream;
+    }
+
+    private void downloadFile(FileStateHandle handle, FileCache fileCache) {
+        try {
+            IOUtils.copyBytes(
+                    wrap(handle.openInputStream()), fileCache.getOutputStreamForSaveCacheData());
+            LOG.debug(
+                    "download and decompress dstl file : {} to local cache file : {}",
+                    handle.getFilePath(),
+                    fileCache.getFilePath());
+
+        } catch (IOException e) {
+            fileCache.setSaveCacheDataException(e);
+        }
+    }
+
+    private void cleanFileCache(Path dfsPath, FileCache fileCache) {
+        if (fileCache.getRefCount() == 0) {
+            LOG.debug("clean local cache file : {}", fileCache.getFilePath());
+            cache.remove(dfsPath);
+            fileCache.discard();
+        }
+    }
+
+    static class FileCache {
+
+        private final File cacheDir;
+        private final AtomicLong writeInBytes;
+        private final AtomicBoolean writeComplete;
+        private final AtomicInteger refCount;
+        private final CountDownLatch readLatch;
+
+        private volatile File file;
+        private volatile FileOutputStream fo;
+        private volatile Exception saveCacheDataException;
+
+        FileCache(File cacheDir) {
+            this.cacheDir = cacheDir;
+            this.writeInBytes = new AtomicLong(0);
+            this.writeComplete = new AtomicBoolean(false);
+            this.refCount = new AtomicInteger(0);
+            this.readLatch = new CountDownLatch(1);
+        }
+
+        String getFilePath() {
+            return this.file.getAbsolutePath();
+        }
+
+        OutputStream getOutputStreamForSaveCacheData() throws IOException {
+            synchronized (this) {
+                if (fo == null) {
+                    file = File.createTempFile(CACHE_FILE_PREFIX, null, cacheDir);
+                    fo = new FileOutputStream(file);
+                    readLatch.countDown();
+                } else {
+                    throw new IllegalStateException("only can get OutputStream once !");
+                }
+            }
+
+            return new OutputStream() {
+                @Override
+                public void write(int b) throws IOException {
+                    fo.write(b);
+                    writeInBytes.incrementAndGet();
+                }
+
+                @Override
+                public void write(byte[] b, int off, int len) throws IOException {
+                    fo.write(b, off, len);
+                    writeInBytes.addAndGet(len);
+                }
+
+                @Override
+                public void close() throws IOException {
+                    fo.close();
+                    writeComplete.set(true);
+                }
+            };
+        }
+
+        void setSaveCacheDataException(Exception e) {
+            this.saveCacheDataException = e;
+        }
+
+        int getRefCount() {
+            return refCount.get();
+        }
+
+        private void handoverException() throws IOException {
+            if (saveCacheDataException != null) {
+                throw new IOException(
+                        "there is a exception when save data to cache file : ",
+                        saveCacheDataException);
+            }
+        }
+
+        FileInputStream open() throws IOException {
+            return open0();
+        }
+
+        FileInputStream openAndSeek(long offset) throws IOException {
+            FileInputStream fin = open0();
+            if (offset != 0) {
+                LOG.debug("seek to {}", offset);
+                fin.getChannel().position(offset);
+            }
+            return fin;
+        }
+
+        private FileInputStream open0() throws IOException {
+            try {
+                readLatch.await();
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+
+            refCount.incrementAndGet();
+
+            return new FileInputStream(file) {
+                private final String id = UUID.randomUUID().toString();
+                private final AtomicBoolean closed = new AtomicBoolean(false);
+
+                @Override
+                public int read() throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.read();
+                    } else {
+                        int data = super.read();
+                        if (data != -1) {
+                            return data;
+                        } else {
+                            waitWrite();
+                            return read();
+                        }
+                    }
+                }
+
+                @Override
+                public int read(byte[] b) throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.read(b);
+                    } else {
+                        int count = super.read(b);
+                        if (count == -1) {
+                            return count;
+                        } else {
+                            waitWrite();
+                            return read(b);
+                        }
+                    }
+                }
+
+                @Override
+                public int read(byte[] b, int off, int len) throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.read(b, off, len);
+                    } else {
+                        int count = super.read(b, off, len);
+                        if (count != -1) {
+                            return count;
+                        } else {
+                            waitWrite();
+                            return read(b, off, len);
+                        }
+                    }
+                }
+
+                @Override
+                public long skip(long n) throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.skip(n);
+                    } else {
+                        long skips = super.skip(n);
+                        if (skips == n) {
+                            return skips;
+                        } else {
+                            waitWrite();
+                            return skip(n - skips);
+                        }
+                    }
+                }
+
+                @Override
+                public int available() throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.available();
+                    } else {
+                        int count = super.available();
+                        if (count != 0) {
+                            return count;
+                        } else {
+                            return 1;
+                        }
+                    }
+                }
+
+                private void waitWrite() {
+                    long writeInBytes0 = writeInBytes.get();
+                    while (writeInBytes0 == writeInBytes.get() && !writeComplete.get()) {
+                        try {
+                            synchronized (fo) {
+                                fo.wait(10);
+                            }
+                        } catch (InterruptedException e) {
+                            Thread.currentThread().interrupt();

Review Comment:
   Shouldn't it be rethrown (in addition to setting the interrupt flag)?



##########
flink-state-backends/flink-statebackend-changelog/src/main/java/org/apache/flink/state/changelog/ChangelogStateBackend.java:
##########
@@ -87,9 +87,14 @@ protected <K> CheckpointableKeyedStateBackend<K> restore(
         String subtaskName = env.getTaskInfo().getTaskNameWithSubtasks();
         ExecutionConfig executionConfig = env.getExecutionConfig();
 
+        env.getAsyncOperationsThreadPool();
+
         ChangelogStateFactory changelogStateFactory = new ChangelogStateFactory();
         CheckpointableKeyedStateBackend<K> keyedStateBackend =
                 ChangelogBackendRestoreOperation.restore(
+                        env.getJobID(),
+                        env.getAsyncOperationsThreadPool(),
+                        env.getTaskManagerInfo().getConfiguration(),

Review Comment:
   I think this should be `env.getJobConfiguration()` rather than `env.getTaskManagerInfo().getConfiguration()`
   
   cc: @fredia 



##########
flink-runtime/src/main/java/org/apache/flink/runtime/state/changelog/StateChangelogStorageLoader.java:
##########
@@ -104,21 +114,35 @@ public static StateChangelogStorage<?> load(
 
     @Nonnull
     public static StateChangelogStorageView<?> loadFromStateHandle(
-            ChangelogStateHandle changelogStateHandle) throws IOException {
+            JobID jobID,
+            ExecutorService asyncExecutor,
+            Configuration configuration,
+            ChangelogStateHandle changelogStateHandle)
+            throws IOException {
         StateChangelogStorageFactory factory =
                 STATE_CHANGELOG_STORAGE_FACTORIES.get(changelogStateHandle.getStorageIdentifier());
+
         if (factory == null) {
             throw new FlinkRuntimeException(
                     String.format(
                             "Cannot find a factory for changelog storage with name '%s' to restore from '%s'.",
                             changelogStateHandle.getStorageIdentifier(),
                             changelogStateHandle.getClass().getSimpleName()));
-        } else {
-            LOG.info(
-                    "Creating a changelog storage with name '{}' to restore from '{}'.",
-                    changelogStateHandle.getStorageIdentifier(),
-                    changelogStateHandle.getClass().getSimpleName());
-            return factory.createStorageView();
         }
+
+        if (!changelogStorageViewsByJobId.containsKey(jobID)) {
+            synchronized (lock) {
+                if (!changelogStorageViewsByJobId.containsKey(jobID)) {

Review Comment:
   Can this be replaced with `computeIfAbsent`?



##########
flink-state-backends/flink-statebackend-changelog/src/main/java/org/apache/flink/state/changelog/ChangelogStateBackend.java:
##########
@@ -87,9 +87,14 @@ protected <K> CheckpointableKeyedStateBackend<K> restore(
         String subtaskName = env.getTaskInfo().getTaskNameWithSubtasks();
         ExecutionConfig executionConfig = env.getExecutionConfig();
 
+        env.getAsyncOperationsThreadPool();
+

Review Comment:
   This seems unnecessary.



##########
flink-runtime/src/main/java/org/apache/flink/runtime/state/changelog/StateChangelogStorageLoader.java:
##########
@@ -50,6 +54,12 @@ public class StateChangelogStorageLoader {
     private static final HashMap<String, StateChangelogStorageFactory>
             STATE_CHANGELOG_STORAGE_FACTORIES = new HashMap<>();
 
+    private static final Object lock = new Object();
+
+    @GuardedBy("lock")
+    private static final ConcurrentHashMap<JobID, StateChangelogStorageView<?>>
+            changelogStorageViewsByJobId = new ConcurrentHashMap<>();

Review Comment:
   I couldn't find the cleanup of this map.
   
   I think that `GuardedBy` could be removed once the comment below is addressed.



##########
flink-dstl/flink-dstl-dfs/src/main/java/org/apache/flink/changelog/fs/StateChangeIteratorWithCache.java:
##########
@@ -0,0 +1,367 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.changelog.fs;
+
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ConfigurationUtils;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.core.memory.DataInputViewStreamWrapper;
+import org.apache.flink.runtime.state.StreamStateHandle;
+import org.apache.flink.runtime.state.changelog.StateChange;
+import org.apache.flink.runtime.state.filesystem.FileStateHandle;
+import org.apache.flink.util.CloseableIterator;
+import org.apache.flink.util.IOUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.apache.flink.configuration.StateChangelogOptions.PERIODIC_MATERIALIZATION_INTERVAL;
+
+/** StateChangeIterator with local cache. */
+class StateChangeIteratorWithCache extends StateChangeIteratorImpl {
+    private static final Logger LOG = LoggerFactory.getLogger(StateChangeIteratorWithCache.class);
+
+    private static final String CACHE_FILE_PREFIX = "dstl-";
+
+    private final File cacheDir;
+    private final ConcurrentMap<Path, FileCache> cache = new ConcurrentHashMap<>();
+    private final ScheduledExecutorService cacheCleanScheduler;
+    private final ExecutorService downloadExecutor;
+    private final long cacheIdleMillis;
+
+    StateChangeIteratorWithCache(ExecutorService downloadExecutor, Configuration config) {
+        // TODO: 2022/5/31 add a new options for cache idle
+        long cacheIdleMillis = config.get(PERIODIC_MATERIALIZATION_INTERVAL).toMillis();
+        File cacheDir = ConfigurationUtils.getRandomTempDirectory(config);
+
+        this.cacheCleanScheduler =
+                SchedulerFactory.create(1, "ChangelogCacheFileCleanScheduler", LOG);
+        this.downloadExecutor = downloadExecutor;
+        this.cacheIdleMillis = cacheIdleMillis;
+        this.cacheDir = cacheDir;
+    }
+
+    @Override
+    public CloseableIterator<StateChange> read(StreamStateHandle handle, long offset)
+            throws IOException {
+
+        if (!(handle instanceof FileStateHandle)) {
+            return new StateChangeFormat().read(wrapAndSeek(handle.openInputStream(), offset));
+        }
+
+        FileStateHandle fileHandle = (FileStateHandle) handle;
+        DataInputStream input;
+
+        if (fileHandle.getFilePath().getFileSystem().isDistributedFS()) {
+
+            Path dfsPath = fileHandle.getFilePath();
+            FileCache fileCache =
+                    cache.computeIfAbsent(
+                            dfsPath,
+                            key -> {
+                                FileCache fCache = new FileCache(cacheDir);
+                                downloadExecutor.execute(() -> downloadFile(fileHandle, fCache));
+                                return fCache;
+                            });
+
+            FileInputStream fin = fileCache.openAndSeek(offset);
+
+            input =
+                    new DataInputStream(new BufferedInputStream(fin)) {
+                        @Override
+                        public void close() throws IOException {
+                            super.close();
+                            if (fileCache.getRefCount() == 0) {
+                                cacheCleanScheduler.schedule(
+                                        () -> cleanFileCache(dfsPath, fileCache),
+                                        cacheIdleMillis,
+                                        TimeUnit.MILLISECONDS);
+                            }
+                        }
+                    };
+        } else {
+            input = wrapAndSeek(handle.openInputStream(), offset);
+        }
+
+        return new StateChangeFormat().read(input);
+    }
+
+    private DataInputViewStreamWrapper wrapAndSeek(InputStream stream, long offset)
+            throws IOException {
+        DataInputViewStreamWrapper wrappedStream = wrap(stream);
+        if (offset != 0) {
+            LOG.debug("seek to {}", offset);
+            wrappedStream.skipBytesToRead((int) offset);
+        }
+        return wrappedStream;
+    }
+
+    private void downloadFile(FileStateHandle handle, FileCache fileCache) {
+        try {
+            IOUtils.copyBytes(
+                    wrap(handle.openInputStream()), fileCache.getOutputStreamForSaveCacheData());
+            LOG.debug(
+                    "download and decompress dstl file : {} to local cache file : {}",
+                    handle.getFilePath(),
+                    fileCache.getFilePath());
+
+        } catch (IOException e) {
+            fileCache.setSaveCacheDataException(e);
+        }
+    }
+
+    private void cleanFileCache(Path dfsPath, FileCache fileCache) {
+        if (fileCache.getRefCount() == 0) {
+            LOG.debug("clean local cache file : {}", fileCache.getFilePath());
+            cache.remove(dfsPath);
+            fileCache.discard();
+        }
+    }
+
+    static class FileCache {
+
+        private final File cacheDir;
+        private final AtomicLong writeInBytes;
+        private final AtomicBoolean writeComplete;
+        private final AtomicInteger refCount;
+        private final CountDownLatch readLatch;
+
+        private volatile File file;
+        private volatile FileOutputStream fo;
+        private volatile Exception saveCacheDataException;
+
+        FileCache(File cacheDir) {
+            this.cacheDir = cacheDir;
+            this.writeInBytes = new AtomicLong(0);
+            this.writeComplete = new AtomicBoolean(false);
+            this.refCount = new AtomicInteger(0);
+            this.readLatch = new CountDownLatch(1);
+        }
+
+        String getFilePath() {
+            return this.file.getAbsolutePath();
+        }
+
+        OutputStream getOutputStreamForSaveCacheData() throws IOException {
+            synchronized (this) {
+                if (fo == null) {
+                    file = File.createTempFile(CACHE_FILE_PREFIX, null, cacheDir);
+                    fo = new FileOutputStream(file);
+                    readLatch.countDown();
+                } else {
+                    throw new IllegalStateException("only can get OutputStream once !");
+                }
+            }
+
+            return new OutputStream() {
+                @Override
+                public void write(int b) throws IOException {
+                    fo.write(b);
+                    writeInBytes.incrementAndGet();
+                }
+
+                @Override
+                public void write(byte[] b, int off, int len) throws IOException {
+                    fo.write(b, off, len);
+                    writeInBytes.addAndGet(len);
+                }
+
+                @Override
+                public void close() throws IOException {
+                    fo.close();
+                    writeComplete.set(true);
+                }
+            };
+        }
+
+        void setSaveCacheDataException(Exception e) {
+            this.saveCacheDataException = e;
+        }
+
+        int getRefCount() {
+            return refCount.get();
+        }
+
+        private void handoverException() throws IOException {
+            if (saveCacheDataException != null) {
+                throw new IOException(
+                        "there is a exception when save data to cache file : ",
+                        saveCacheDataException);
+            }
+        }
+
+        FileInputStream open() throws IOException {
+            return open0();
+        }
+
+        FileInputStream openAndSeek(long offset) throws IOException {
+            FileInputStream fin = open0();
+            if (offset != 0) {
+                LOG.debug("seek to {}", offset);
+                fin.getChannel().position(offset);
+            }
+            return fin;
+        }
+
+        private FileInputStream open0() throws IOException {
+            try {
+                readLatch.await();
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+
+            refCount.incrementAndGet();
+
+            return new FileInputStream(file) {
+                private final String id = UUID.randomUUID().toString();
+                private final AtomicBoolean closed = new AtomicBoolean(false);
+
+                @Override
+                public int read() throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.read();
+                    } else {
+                        int data = super.read();
+                        if (data != -1) {
+                            return data;
+                        } else {
+                            waitWrite();
+                            return read();
+                        }
+                    }
+                }
+
+                @Override
+                public int read(byte[] b) throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.read(b);
+                    } else {
+                        int count = super.read(b);
+                        if (count == -1) {
+                            return count;
+                        } else {
+                            waitWrite();
+                            return read(b);
+                        }
+                    }
+                }
+
+                @Override
+                public int read(byte[] b, int off, int len) throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.read(b, off, len);
+                    } else {
+                        int count = super.read(b, off, len);
+                        if (count != -1) {
+                            return count;
+                        } else {
+                            waitWrite();
+                            return read(b, off, len);
+                        }
+                    }
+                }
+
+                @Override
+                public long skip(long n) throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.skip(n);
+                    } else {
+                        long skips = super.skip(n);
+                        if (skips == n) {
+                            return skips;
+                        } else {
+                            waitWrite();
+                            return skip(n - skips);
+                        }
+                    }
+                }
+
+                @Override
+                public int available() throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.available();
+                    } else {
+                        int count = super.available();
+                        if (count != 0) {
+                            return count;
+                        } else {
+                            return 1;
+                        }
+                    }
+                }
+
+                private void waitWrite() {
+                    long writeInBytes0 = writeInBytes.get();
+                    while (writeInBytes0 == writeInBytes.get() && !writeComplete.get()) {
+                        try {
+                            synchronized (fo) {
+                                fo.wait(10);

Review Comment:
   I don't see a corresponding `notifyAll`, so this should probably just be a `sleep`
   (but see the comment above).



##########
flink-dstl/flink-dstl-dfs/src/main/java/org/apache/flink/changelog/fs/StateChangeIteratorWithCache.java:
##########
@@ -0,0 +1,367 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.changelog.fs;
+
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ConfigurationUtils;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.core.memory.DataInputViewStreamWrapper;
+import org.apache.flink.runtime.state.StreamStateHandle;
+import org.apache.flink.runtime.state.changelog.StateChange;
+import org.apache.flink.runtime.state.filesystem.FileStateHandle;
+import org.apache.flink.util.CloseableIterator;
+import org.apache.flink.util.IOUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.apache.flink.configuration.StateChangelogOptions.PERIODIC_MATERIALIZATION_INTERVAL;
+
+/** StateChangeIterator with local cache. */
+class StateChangeIteratorWithCache extends StateChangeIteratorImpl {
+    private static final Logger LOG = LoggerFactory.getLogger(StateChangeIteratorWithCache.class);
+
+    private static final String CACHE_FILE_PREFIX = "dstl-";
+
+    private final File cacheDir;
+    private final ConcurrentMap<Path, FileCache> cache = new ConcurrentHashMap<>();
+    private final ScheduledExecutorService cacheCleanScheduler;
+    private final ExecutorService downloadExecutor;
+    private final long cacheIdleMillis;
+
+    StateChangeIteratorWithCache(ExecutorService downloadExecutor, Configuration config) {
+        // TODO: 2022/5/31 add a new options for cache idle
+        long cacheIdleMillis = config.get(PERIODIC_MATERIALIZATION_INTERVAL).toMillis();
+        File cacheDir = ConfigurationUtils.getRandomTempDirectory(config);
+
+        this.cacheCleanScheduler =
+                SchedulerFactory.create(1, "ChangelogCacheFileCleanScheduler", LOG);
+        this.downloadExecutor = downloadExecutor;
+        this.cacheIdleMillis = cacheIdleMillis;
+        this.cacheDir = cacheDir;
+    }
+
+    @Override
+    public CloseableIterator<StateChange> read(StreamStateHandle handle, long offset)
+            throws IOException {
+
+        if (!(handle instanceof FileStateHandle)) {
+            return new StateChangeFormat().read(wrapAndSeek(handle.openInputStream(), offset));
+        }
+
+        FileStateHandle fileHandle = (FileStateHandle) handle;
+        DataInputStream input;
+
+        if (fileHandle.getFilePath().getFileSystem().isDistributedFS()) {
+
+            Path dfsPath = fileHandle.getFilePath();
+            FileCache fileCache =
+                    cache.computeIfAbsent(
+                            dfsPath,
+                            key -> {
+                                FileCache fCache = new FileCache(cacheDir);
+                                downloadExecutor.execute(() -> downloadFile(fileHandle, fCache));
+                                return fCache;
+                            });
+
+            FileInputStream fin = fileCache.openAndSeek(offset);
+
+            input =
+                    new DataInputStream(new BufferedInputStream(fin)) {
+                        @Override
+                        public void close() throws IOException {
+                            super.close();
+                            if (fileCache.getRefCount() == 0) {
+                                cacheCleanScheduler.schedule(
+                                        () -> cleanFileCache(dfsPath, fileCache),
+                                        cacheIdleMillis,
+                                        TimeUnit.MILLISECONDS);
+                            }
+                        }
+                    };
+        } else {
+            input = wrapAndSeek(handle.openInputStream(), offset);
+        }
+
+        return new StateChangeFormat().read(input);
+    }
+
+    private DataInputViewStreamWrapper wrapAndSeek(InputStream stream, long offset)
+            throws IOException {
+        DataInputViewStreamWrapper wrappedStream = wrap(stream);
+        if (offset != 0) {
+            LOG.debug("seek to {}", offset);
+            wrappedStream.skipBytesToRead((int) offset);
+        }
+        return wrappedStream;
+    }
+
+    private void downloadFile(FileStateHandle handle, FileCache fileCache) {
+        try {
+            IOUtils.copyBytes(
+                    wrap(handle.openInputStream()), fileCache.getOutputStreamForSaveCacheData());
+            LOG.debug(
+                    "download and decompress dstl file : {} to local cache file : {}",
+                    handle.getFilePath(),
+                    fileCache.getFilePath());
+
+        } catch (IOException e) {
+            fileCache.setSaveCacheDataException(e);
+        }
+    }
+
+    private void cleanFileCache(Path dfsPath, FileCache fileCache) {
+        if (fileCache.getRefCount() == 0) {
+            LOG.debug("clean local cache file : {}", fileCache.getFilePath());
+            cache.remove(dfsPath);
+            fileCache.discard();
+        }
+    }
+
+    static class FileCache {
+
+        private final File cacheDir;
+        private final AtomicLong writeInBytes;
+        private final AtomicBoolean writeComplete;
+        private final AtomicInteger refCount;
+        private final CountDownLatch readLatch;
+
+        private volatile File file;
+        private volatile FileOutputStream fo;
+        private volatile Exception saveCacheDataException;
+
+        FileCache(File cacheDir) {
+            this.cacheDir = cacheDir;
+            this.writeInBytes = new AtomicLong(0);
+            this.writeComplete = new AtomicBoolean(false);
+            this.refCount = new AtomicInteger(0);
+            this.readLatch = new CountDownLatch(1);
+        }
+
+        String getFilePath() {
+            return this.file.getAbsolutePath();
+        }
+
+        OutputStream getOutputStreamForSaveCacheData() throws IOException {
+            synchronized (this) {
+                if (fo == null) {
+                    file = File.createTempFile(CACHE_FILE_PREFIX, null, cacheDir);
+                    fo = new FileOutputStream(file);
+                    readLatch.countDown();
+                } else {
+                    throw new IllegalStateException("only can get OutputStream once !");
+                }
+            }
+
+            return new OutputStream() {
+                @Override
+                public void write(int b) throws IOException {
+                    fo.write(b);
+                    writeInBytes.incrementAndGet();
+                }
+
+                @Override
+                public void write(byte[] b, int off, int len) throws IOException {
+                    fo.write(b, off, len);
+                    writeInBytes.addAndGet(len);
+                }
+
+                @Override
+                public void close() throws IOException {
+                    fo.close();
+                    writeComplete.set(true);
+                }
+            };
+        }
+
+        void setSaveCacheDataException(Exception e) {
+            this.saveCacheDataException = e;
+        }
+
+        int getRefCount() {
+            return refCount.get();
+        }
+
+        private void handoverException() throws IOException {
+            if (saveCacheDataException != null) {
+                throw new IOException(
+                        "there is a exception when save data to cache file : ",
+                        saveCacheDataException);
+            }
+        }
+
+        FileInputStream open() throws IOException {
+            return open0();
+        }
+
+        FileInputStream openAndSeek(long offset) throws IOException {
+            FileInputStream fin = open0();
+            if (offset != 0) {
+                LOG.debug("seek to {}", offset);
+                fin.getChannel().position(offset);
+            }
+            return fin;
+        }
+
+        private FileInputStream open0() throws IOException {
+            try {
+                readLatch.await();
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+
+            refCount.incrementAndGet();
+
+            return new FileInputStream(file) {
+                private final String id = UUID.randomUUID().toString();
+                private final AtomicBoolean closed = new AtomicBoolean(false);
+
+                @Override
+                public int read() throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.read();
+                    } else {
+                        int data = super.read();
+                        if (data != -1) {
+                            return data;
+                        } else {
+                            waitWrite();
+                            return read();
+                        }
+                    }
+                }
+
+                @Override
+                public int read(byte[] b) throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.read(b);
+                    } else {
+                        int count = super.read(b);
+                        if (count == -1) {
+                            return count;
+                        } else {
+                            waitWrite();
+                            return read(b);
+                        }
+                    }
+                }
+
+                @Override
+                public int read(byte[] b, int off, int len) throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.read(b, off, len);
+                    } else {
+                        int count = super.read(b, off, len);
+                        if (count != -1) {
+                            return count;
+                        } else {
+                            waitWrite();
+                            return read(b, off, len);
+                        }
+                    }
+                }
+
+                @Override
+                public long skip(long n) throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.skip(n);
+                    } else {
+                        long skips = super.skip(n);
+                        if (skips == n) {
+                            return skips;
+                        } else {
+                            waitWrite();
+                            return skip(n - skips);
+                        }
+                    }
+                }
+
+                @Override
+                public int available() throws IOException {
+                    handoverException();
+                    if (writeComplete.get()) {
+                        return super.available();
+                    } else {
+                        int count = super.available();
+                        if (count != 0) {
+                            return count;
+                        } else {
+                            return 1;
+                        }
+                    }
+                }
+
+                private void waitWrite() {
+                    long writeInBytes0 = writeInBytes.get();
+                    while (writeInBytes0 == writeInBytes.get() && !writeComplete.get()) {
+                        try {
+                            synchronized (fo) {

Review Comment:
   Do we really need to allow partial reads before the whole file is downloaded?
   I think the added complexity isn't worth it (and if it is, the optimization can be added later).
   
   Also, performance-wise it could be worse because of the CAS operation and potential context switches on every write.



##########
flink-dstl/flink-dstl-dfs/src/main/java/org/apache/flink/changelog/fs/FsStateChangelogStorageForRecovery.java:
##########
@@ -27,14 +28,31 @@
 
 import javax.annotation.concurrent.ThreadSafe;
 
+import java.util.concurrent.ExecutorService;
+
 /** Filesystem-based implementation of {@link StateChangelogStorage} just for recovery. */
 @Experimental
 @ThreadSafe
 public class FsStateChangelogStorageForRecovery
         implements StateChangelogStorageView<ChangelogStateHandleStreamImpl> {
 
+    private final StateChangeIteratorWithCache stateChangeIteratorWithCache;
+
+    public FsStateChangelogStorageForRecovery() {
+        this.stateChangeIteratorWithCache = null;

Review Comment:
   Can we move `new StateChangeIteratorImpl()` here to avoid the nullable field and the corresponding check?



##########
flink-dstl/flink-dstl-dfs/src/main/java/org/apache/flink/changelog/fs/StateChangeIteratorWithCache.java:
##########
@@ -0,0 +1,367 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.changelog.fs;
+
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ConfigurationUtils;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.core.memory.DataInputViewStreamWrapper;
+import org.apache.flink.runtime.state.StreamStateHandle;
+import org.apache.flink.runtime.state.changelog.StateChange;
+import org.apache.flink.runtime.state.filesystem.FileStateHandle;
+import org.apache.flink.util.CloseableIterator;
+import org.apache.flink.util.IOUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.apache.flink.configuration.StateChangelogOptions.PERIODIC_MATERIALIZATION_INTERVAL;
+
+/** StateChangeIterator with local cache. */
+class StateChangeIteratorWithCache extends StateChangeIteratorImpl {
+    private static final Logger LOG = LoggerFactory.getLogger(StateChangeIteratorWithCache.class);
+
+    private static final String CACHE_FILE_PREFIX = "dstl-";
+
+    private final File cacheDir;
+    private final ConcurrentMap<Path, FileCache> cache = new ConcurrentHashMap<>();
+    private final ScheduledExecutorService cacheCleanScheduler;
+    private final ExecutorService downloadExecutor;
+    private final long cacheIdleMillis;
+
+    StateChangeIteratorWithCache(ExecutorService downloadExecutor, Configuration config) {
+        // TODO: 2022/5/31 add a new options for cache idle
+        long cacheIdleMillis = config.get(PERIODIC_MATERIALIZATION_INTERVAL).toMillis();
+        File cacheDir = ConfigurationUtils.getRandomTempDirectory(config);
+
+        this.cacheCleanScheduler =
+                SchedulerFactory.create(1, "ChangelogCacheFileCleanScheduler", LOG);
+        this.downloadExecutor = downloadExecutor;
+        this.cacheIdleMillis = cacheIdleMillis;
+        this.cacheDir = cacheDir;
+    }
+
+    @Override
+    public CloseableIterator<StateChange> read(StreamStateHandle handle, long offset)
+            throws IOException {
+
+        if (!(handle instanceof FileStateHandle)) {
+            return new StateChangeFormat().read(wrapAndSeek(handle.openInputStream(), offset));
+        }
+
+        FileStateHandle fileHandle = (FileStateHandle) handle;
+        DataInputStream input;
+
+        if (fileHandle.getFilePath().getFileSystem().isDistributedFS()) {
+
+            Path dfsPath = fileHandle.getFilePath();
+            FileCache fileCache =
+                    cache.computeIfAbsent(
+                            dfsPath,
+                            key -> {
+                                FileCache fCache = new FileCache(cacheDir);
+                                downloadExecutor.execute(() -> downloadFile(fileHandle, fCache));
+                                return fCache;
+                            });
+
+            FileInputStream fin = fileCache.openAndSeek(offset);
+
+            input =
+                    new DataInputStream(new BufferedInputStream(fin)) {
+                        @Override
+                        public void close() throws IOException {
+                            super.close();
+                            if (fileCache.getRefCount() == 0) {
+                                cacheCleanScheduler.schedule(
+                                        () -> cleanFileCache(dfsPath, fileCache),
+                                        cacheIdleMillis,
+                                        TimeUnit.MILLISECONDS);
+                            }
+                        }
+                    };
+        } else {
+            input = wrapAndSeek(handle.openInputStream(), offset);
+        }
+
+        return new StateChangeFormat().read(input);
+    }
+
+    private DataInputViewStreamWrapper wrapAndSeek(InputStream stream, long offset)
+            throws IOException {
+        DataInputViewStreamWrapper wrappedStream = wrap(stream);
+        if (offset != 0) {
+            LOG.debug("seek to {}", offset);
+            wrappedStream.skipBytesToRead((int) offset);
+        }
+        return wrappedStream;
+    }
+
+    private void downloadFile(FileStateHandle handle, FileCache fileCache) {
+        try {
+            IOUtils.copyBytes(
+                    wrap(handle.openInputStream()), fileCache.getOutputStreamForSaveCacheData());
+            LOG.debug(
+                    "download and decompress dstl file : {} to local cache file : {}",
+                    handle.getFilePath(),
+                    fileCache.getFilePath());
+
+        } catch (IOException e) {
+            fileCache.setSaveCacheDataException(e);
+        }
+    }
+
+    private void cleanFileCache(Path dfsPath, FileCache fileCache) {
+        if (fileCache.getRefCount() == 0) {
+            LOG.debug("clean local cache file : {}", fileCache.getFilePath());
+            cache.remove(dfsPath);
+            fileCache.discard();
+        }
+    }
+
+    static class FileCache {
+
+        private final File cacheDir;
+        private final AtomicLong writeInBytes;
+        private final AtomicBoolean writeComplete;
+        private final AtomicInteger refCount;
+        private final CountDownLatch readLatch;
+
+        private volatile File file;
+        private volatile FileOutputStream fo;
+        private volatile Exception saveCacheDataException;
+
+        FileCache(File cacheDir) {
+            this.cacheDir = cacheDir;
+            this.writeInBytes = new AtomicLong(0);
+            this.writeComplete = new AtomicBoolean(false);
+            this.refCount = new AtomicInteger(0);
+            this.readLatch = new CountDownLatch(1);
+        }
+
+        String getFilePath() {
+            return this.file.getAbsolutePath();
+        }
+
+        OutputStream getOutputStreamForSaveCacheData() throws IOException {
+            synchronized (this) {
+                if (fo == null) {
+                    file = File.createTempFile(CACHE_FILE_PREFIX, null, cacheDir);
+                    fo = new FileOutputStream(file);
+                    readLatch.countDown();
+                } else {
+                    throw new IllegalStateException("only can get OutputStream once !");
+                }
+            }
+
+            return new OutputStream() {
+                @Override
+                public void write(int b) throws IOException {
+                    fo.write(b);
+                    writeInBytes.incrementAndGet();
+                }
+
+                @Override
+                public void write(byte[] b, int off, int len) throws IOException {
+                    fo.write(b, off, len);
+                    writeInBytes.addAndGet(len);
+                }
+
+                @Override
+                public void close() throws IOException {
+                    fo.close();
+                    writeComplete.set(true);
+                }
+            };
+        }
+
+        void setSaveCacheDataException(Exception e) {
+            this.saveCacheDataException = e;
+        }
+
+        int getRefCount() {
+            return refCount.get();
+        }
+
+        private void handoverException() throws IOException {
+            if (saveCacheDataException != null) {
+                throw new IOException(
+                        "there is a exception when save data to cache file : ",
+                        saveCacheDataException);
+            }
+        }
+
+        FileInputStream open() throws IOException {
+            return open0();
+        }
+
+        FileInputStream openAndSeek(long offset) throws IOException {
+            FileInputStream fin = open0();
+            if (offset != 0) {
+                LOG.debug("seek to {}", offset);
+                fin.getChannel().position(offset);
+            }
+            return fin;
+        }
+
+        private FileInputStream open0() throws IOException {
+            try {
+                readLatch.await();
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+
+            refCount.incrementAndGet();
+
+            return new FileInputStream(file) {
+                private final String id = UUID.randomUUID().toString();

Review Comment:
   Not used?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@flink.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org