You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by "ryan-johnson-databricks (via GitHub)" <gi...@apache.org> on 2023/05/11 19:31:58 UTC

[GitHub] [spark] ryan-johnson-databricks commented on a diff in pull request #40982: [SPARK-43300][CORE] NonFateSharingCache wrapper for Guava Cache

ryan-johnson-databricks commented on code in PR #40982:
URL: https://github.com/apache/spark/pull/40982#discussion_r1191590888


##########
core/src/test/scala/org/apache/spark/util/NonFateSharingCacheSuite.scala:
##########
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util;
+
+import java.util.concurrent.ExecutionException
+import java.util.concurrent.Semaphore
+import java.util.concurrent.atomic.AtomicReference
+
+import com.google.common.cache.CacheBuilder
+import com.google.common.cache.CacheLoader
+
+import org.apache.spark.SparkFunSuite
+
+object NonFateSharingCacheSuite {
+  private val TEST_KEY = "key"
+  private val FAIL_MESSAGE = "loading failed"
+  private val THREAD2_HOLDER = new AtomicReference[Thread](null)
+
+  class TestCacheLoader extends CacheLoader[String, String] {
+    var intentionalFail: ThreadLocal[Boolean] = ThreadLocal.withInitial(() => false)
+    var startLoading = new Semaphore(0)
+
+    def waitUntilThread2Waiting(): Unit = {
+      while (true) {
+        Thread.sleep(100)
+        val t2 = THREAD2_HOLDER.get()
+        if (t2 != null && t2.getState.equals(Thread.State.WAITING)) {
+          return
+        }
+      }
+    }
+
+    override def load(key: String): String = {
+      startLoading.release()
+      if (Thread.currentThread().getName.contains("test-executor1")) {
+        waitUntilThread2Waiting()
+      }
+      if (intentionalFail.get) throw new RuntimeException(FAIL_MESSAGE)
+      key
+    }
+  }
+}
+
+/**
+ * Test non-fate-sharing behavior
+ */
+class NonFateSharingCacheSuite extends SparkFunSuite {
+
+  type WorkerFunc = () => Unit
+
+  import NonFateSharingCacheSuite._
+
+  test("test LoadingCache") {
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(TEST_KEY)
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test LoadingCache mix usage of default loader and provided loader") {
+    // Intentionally mix usage of default loader and provided value loader.
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test Cache") {
+    val loader = new TestCacheLoader
+    val cache = NonFateSharingCache(CacheBuilder.newBuilder.build[String, String])
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      cache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    val thread2Task: WorkerFunc = () => {
+      cache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    testImpl(cache, loader, thread1Task, thread2Task)
+  }
+
+  def testImpl(
+    cache: NonFateSharingCache[String, String],
+    loader: TestCacheLoader,
+    thread1Task: WorkerFunc,
+    thread2Task: WorkerFunc): Unit = {
+    val executor1 = ThreadUtils.newDaemonSingleThreadExecutor("test-executor1")
+    val executor2 = ThreadUtils.newDaemonSingleThreadExecutor("test-executor2")
+    val f1 = executor1.submit(new Runnable {
+      override def run(): Unit = {
+        thread1Task()

Review Comment:
   nit: can be a one-liner:
   ```scala
   override def run(): Unit = thread1Task()
   ```
   (but first see runnable interface comment below)



##########
core/src/test/scala/org/apache/spark/util/NonFateSharingCacheSuite.scala:
##########
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util;
+
+import java.util.concurrent.ExecutionException
+import java.util.concurrent.Semaphore
+import java.util.concurrent.atomic.AtomicReference
+
+import com.google.common.cache.CacheBuilder
+import com.google.common.cache.CacheLoader
+
+import org.apache.spark.SparkFunSuite
+
+object NonFateSharingCacheSuite {
+  private val TEST_KEY = "key"
+  private val FAIL_MESSAGE = "loading failed"
+  private val THREAD2_HOLDER = new AtomicReference[Thread](null)
+
+  class TestCacheLoader extends CacheLoader[String, String] {
+    var intentionalFail: ThreadLocal[Boolean] = ThreadLocal.withInitial(() => false)
+    var startLoading = new Semaphore(0)
+
+    def waitUntilThread2Waiting(): Unit = {
+      while (true) {
+        Thread.sleep(100)
+        val t2 = THREAD2_HOLDER.get()
+        if (t2 != null && t2.getState.equals(Thread.State.WAITING)) {
+          return
+        }
+      }
+    }
+
+    override def load(key: String): String = {
+      startLoading.release()
+      if (Thread.currentThread().getName.contains("test-executor1")) {
+        waitUntilThread2Waiting()
+      }
+      if (intentionalFail.get) throw new RuntimeException(FAIL_MESSAGE)
+      key
+    }
+  }
+}
+
+/**
+ * Test non-fate-sharing behavior
+ */
+class NonFateSharingCacheSuite extends SparkFunSuite {
+
+  type WorkerFunc = () => Unit
+
+  import NonFateSharingCacheSuite._
+
+  test("test LoadingCache") {
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(TEST_KEY)
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test LoadingCache mix usage of default loader and provided loader") {
+    // Intentionally mix usage of default loader and provided value loader.
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test Cache") {
+    val loader = new TestCacheLoader
+    val cache = NonFateSharingCache(CacheBuilder.newBuilder.build[String, String])
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      cache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    val thread2Task: WorkerFunc = () => {
+      cache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    testImpl(cache, loader, thread1Task, thread2Task)
+  }
+
+  def testImpl(
+    cache: NonFateSharingCache[String, String],
+    loader: TestCacheLoader,
+    thread1Task: WorkerFunc,
+    thread2Task: WorkerFunc): Unit = {

Review Comment:
   nit: indent these 4 spaces?



##########
core/src/test/scala/org/apache/spark/util/NonFateSharingCacheSuite.scala:
##########
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util;
+
+import java.util.concurrent.ExecutionException
+import java.util.concurrent.Semaphore
+import java.util.concurrent.atomic.AtomicReference
+
+import com.google.common.cache.CacheBuilder
+import com.google.common.cache.CacheLoader
+
+import org.apache.spark.SparkFunSuite
+
+object NonFateSharingCacheSuite {
+  private val TEST_KEY = "key"
+  private val FAIL_MESSAGE = "loading failed"
+  private val THREAD2_HOLDER = new AtomicReference[Thread](null)
+
+  class TestCacheLoader extends CacheLoader[String, String] {
+    var intentionalFail: ThreadLocal[Boolean] = ThreadLocal.withInitial(() => false)
+    var startLoading = new Semaphore(0)
+
+    def waitUntilThread2Waiting(): Unit = {
+      while (true) {
+        Thread.sleep(100)
+        val t2 = THREAD2_HOLDER.get()
+        if (t2 != null && t2.getState.equals(Thread.State.WAITING)) {
+          return
+        }
+      }
+    }
+
+    override def load(key: String): String = {
+      startLoading.release()
+      if (Thread.currentThread().getName.contains("test-executor1")) {
+        waitUntilThread2Waiting()
+      }
+      if (intentionalFail.get) throw new RuntimeException(FAIL_MESSAGE)
+      key
+    }
+  }
+}
+
+/**
+ * Test non-fate-sharing behavior
+ */
+class NonFateSharingCacheSuite extends SparkFunSuite {
+
+  type WorkerFunc = () => Unit
+
+  import NonFateSharingCacheSuite._
+
+  test("test LoadingCache") {
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(TEST_KEY)
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test LoadingCache mix usage of default loader and provided loader") {
+    // Intentionally mix usage of default loader and provided value loader.
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)

Review Comment:
   nit: any particular reason for the line breaks?
   ```scala
   loadingCache.get(TEST_KEY, () => loader.load(TEST_KEY))
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala:
##########
@@ -1577,7 +1577,7 @@ object CodeGenerator extends Logging {
    * automatically, in order to constrain its memory footprint.  Note that this cache does not use
    * weak keys/values and thus does not respond to memory pressure.
    */
-  private val cache = CacheBuilder.newBuilder()
+  private val cache = NonFateSharingCache(CacheBuilder.newBuilder()

Review Comment:
   I don't see any comment explaining why codegen needs a non fate sharing cache? What bug does it fix?



##########
core/src/main/scala/org/apache/spark/util/NonFateSharingCache.scala:
##########
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import java.util.concurrent.Callable
+
+import com.google.common.cache.Cache
+import com.google.common.cache.LoadingCache
+
+/**
+ * SPARK-43300: Guava cache fate-sharing behavior might lead to unexpected cascade failure:
+ * when multiple threads access the same key in the cache at the same time when the key is not in
+ * the cache, Guava cache will block all requests and load the data only once. If the loading fails,
+ * all requests will fail immediately without retry. Therefore individual failure will also fail
+ * other irrelevant queries who are waiting for the same key.
+ *
+ * This util create a delegation Cache with KeyLock to synchronize threads looking for the same key
+ * so that they should run individually and fail as if they had arrived one at a time.
+ *
+ * Instead of implementing Guava Cache and LoadingCache interface, we defined our own narrower APIs
+ * so that we can control at compile time what cache operations are allowed. Feel free to add new
+ * APIs when needed.

Review Comment:
   I don't think the "feel free to..." comment is especially helpful... can probably just remove it.



##########
core/src/main/scala/org/apache/spark/util/NonFateSharingCache.scala:
##########
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import java.util.concurrent.Callable
+
+import com.google.common.cache.Cache
+import com.google.common.cache.LoadingCache
+
+/**
+ * SPARK-43300: Guava cache fate-sharing behavior might lead to unexpected cascade failure:
+ * when multiple threads access the same key in the cache at the same time when the key is not in
+ * the cache, Guava cache will block all requests and load the data only once. If the loading fails,
+ * all requests will fail immediately without retry. Therefore individual failure will also fail
+ * other irrelevant queries who are waiting for the same key.
+ *
+ * This util create a delegation Cache with KeyLock to synchronize threads looking for the same key
+ * so that they should run individually and fail as if they had arrived one at a time.
+ *
+ * Instead of implementing Guava Cache and LoadingCache interface, we defined our own narrower APIs
+ * so that we can control at compile time what cache operations are allowed. Feel free to add new
+ * APIs when needed.
+ */
+object NonFateSharingCache {
+  def apply[K, V](cache: Cache[K, V]): NonFateSharingCache[K, V] = cache match {

Review Comment:
   Worth a quick doc comment to explain that this method will return a `NonFateSharingLoadingCache` if the user happens to pass a `LoadingCache`, as a courtesy to the user?



##########
core/src/main/scala/org/apache/spark/util/NonFateSharingCache.scala:
##########
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import java.util.concurrent.Callable
+
+import com.google.common.cache.Cache
+import com.google.common.cache.LoadingCache
+
+/**
+ * SPARK-43300: Guava cache fate-sharing behavior might lead to unexpected cascade failure:
+ * when multiple threads access the same key in the cache at the same time when the key is not in
+ * the cache, Guava cache will block all requests and load the data only once. If the loading fails,
+ * all requests will fail immediately without retry. Therefore individual failure will also fail
+ * other irrelevant queries who are waiting for the same key.
+ *
+ * This util create a delegation Cache with KeyLock to synchronize threads looking for the same key
+ * so that they should run individually and fail as if they had arrived one at a time.
+ *
+ * Instead of implementing Guava Cache and LoadingCache interface, we defined our own narrower APIs
+ * so that we can control at compile time what cache operations are allowed. Feel free to add new
+ * APIs when needed.

Review Comment:
   Meanwhile, it would probably be helpful to explain _why_ we don't/can't/won't expose the full Guava cache API?



##########
core/src/test/scala/org/apache/spark/util/NonFateSharingCacheSuite.scala:
##########
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util;
+
+import java.util.concurrent.ExecutionException
+import java.util.concurrent.Semaphore
+import java.util.concurrent.atomic.AtomicReference
+
+import com.google.common.cache.CacheBuilder
+import com.google.common.cache.CacheLoader
+
+import org.apache.spark.SparkFunSuite
+
+object NonFateSharingCacheSuite {
+  private val TEST_KEY = "key"
+  private val FAIL_MESSAGE = "loading failed"
+  private val THREAD2_HOLDER = new AtomicReference[Thread](null)
+
+  class TestCacheLoader extends CacheLoader[String, String] {
+    var intentionalFail: ThreadLocal[Boolean] = ThreadLocal.withInitial(() => false)
+    var startLoading = new Semaphore(0)
+
+    def waitUntilThread2Waiting(): Unit = {
+      while (true) {
+        Thread.sleep(100)
+        val t2 = THREAD2_HOLDER.get()
+        if (t2 != null && t2.getState.equals(Thread.State.WAITING)) {
+          return
+        }
+      }
+    }
+
+    override def load(key: String): String = {
+      startLoading.release()
+      if (Thread.currentThread().getName.contains("test-executor1")) {
+        waitUntilThread2Waiting()
+      }
+      if (intentionalFail.get) throw new RuntimeException(FAIL_MESSAGE)
+      key
+    }
+  }
+}
+
+/**
+ * Test non-fate-sharing behavior
+ */
+class NonFateSharingCacheSuite extends SparkFunSuite {
+
+  type WorkerFunc = () => Unit
+
+  import NonFateSharingCacheSuite._
+
+  test("test LoadingCache") {
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(TEST_KEY)
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test LoadingCache mix usage of default loader and provided loader") {
+    // Intentionally mix usage of default loader and provided value loader.
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test Cache") {
+    val loader = new TestCacheLoader
+    val cache = NonFateSharingCache(CacheBuilder.newBuilder.build[String, String])
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      cache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    val thread2Task: WorkerFunc = () => {
+      cache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    testImpl(cache, loader, thread1Task, thread2Task)
+  }
+
+  def testImpl(
+    cache: NonFateSharingCache[String, String],
+    loader: TestCacheLoader,
+    thread1Task: WorkerFunc,
+    thread2Task: WorkerFunc): Unit = {
+    val executor1 = ThreadUtils.newDaemonSingleThreadExecutor("test-executor1")
+    val executor2 = ThreadUtils.newDaemonSingleThreadExecutor("test-executor2")
+    val f1 = executor1.submit(new Runnable {
+      override def run(): Unit = {
+        thread1Task()
+      }
+    })
+    val f2 = executor2.submit(new Runnable {
+      override def run(): Unit = {
+        loader.startLoading.acquire() // wait until thread1 start loading
+        THREAD2_HOLDER.set(Thread.currentThread())
+        thread2Task()
+      }
+    })

Review Comment:
   Double check, but I'm pretty sure that this can simplify because runnable is a callable interface:
   ```scala
   val f1 = executor1.submit(() => thread1Task())
   val f2 = executor1.submit { () =>
     ...
   }
   ```
   



##########
core/src/test/scala/org/apache/spark/util/NonFateSharingCacheSuite.scala:
##########
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util;
+
+import java.util.concurrent.ExecutionException
+import java.util.concurrent.Semaphore
+import java.util.concurrent.atomic.AtomicReference
+
+import com.google.common.cache.CacheBuilder
+import com.google.common.cache.CacheLoader
+
+import org.apache.spark.SparkFunSuite
+
+object NonFateSharingCacheSuite {
+  private val TEST_KEY = "key"
+  private val FAIL_MESSAGE = "loading failed"
+  private val THREAD2_HOLDER = new AtomicReference[Thread](null)
+
+  class TestCacheLoader extends CacheLoader[String, String] {
+    var intentionalFail: ThreadLocal[Boolean] = ThreadLocal.withInitial(() => false)
+    var startLoading = new Semaphore(0)
+
+    def waitUntilThread2Waiting(): Unit = {
+      while (true) {
+        Thread.sleep(100)
+        val t2 = THREAD2_HOLDER.get()
+        if (t2 != null && t2.getState.equals(Thread.State.WAITING)) {
+          return
+        }
+      }
+    }
+
+    override def load(key: String): String = {
+      startLoading.release()
+      if (Thread.currentThread().getName.contains("test-executor1")) {
+        waitUntilThread2Waiting()
+      }
+      if (intentionalFail.get) throw new RuntimeException(FAIL_MESSAGE)
+      key
+    }
+  }
+}
+
+/**
+ * Test non-fate-sharing behavior
+ */
+class NonFateSharingCacheSuite extends SparkFunSuite {
+
+  type WorkerFunc = () => Unit
+
+  import NonFateSharingCacheSuite._
+
+  test("test LoadingCache") {
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(TEST_KEY)
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test LoadingCache mix usage of default loader and provided loader") {
+    // Intentionally mix usage of default loader and provided value loader.
+    val loader = new TestCacheLoader
+    val loadingCache: NonFateSharingLoadingCache[String, String] =
+      NonFateSharingCache(CacheBuilder.newBuilder.build(loader))
+    val thread1Task: WorkerFunc = () => {
+      loader.intentionalFail.set(true)
+      loadingCache.get(
+        TEST_KEY,
+        () => loader.load(TEST_KEY)
+      )
+    }
+    val thread2Task: WorkerFunc = () => {
+      loadingCache.get(TEST_KEY)
+    }
+    testImpl(loadingCache, loader, thread1Task, thread2Task)
+  }
+
+  test("test Cache") {

Review Comment:
   Can we use more descriptive names for test cases? 
   Ideally it explains what behavior the test actually verifies.



##########
core/src/test/scala/org/apache/spark/util/NonFateSharingCacheSuite.scala:
##########
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util;
+
+import java.util.concurrent.ExecutionException
+import java.util.concurrent.Semaphore
+import java.util.concurrent.atomic.AtomicReference
+
+import com.google.common.cache.CacheBuilder
+import com.google.common.cache.CacheLoader
+
+import org.apache.spark.SparkFunSuite
+
+object NonFateSharingCacheSuite {
+  private val TEST_KEY = "key"
+  private val FAIL_MESSAGE = "loading failed"
+  private val THREAD2_HOLDER = new AtomicReference[Thread](null)
+
+  class TestCacheLoader extends CacheLoader[String, String] {
+    var intentionalFail: ThreadLocal[Boolean] = ThreadLocal.withInitial(() => false)
+    var startLoading = new Semaphore(0)
+
+    def waitUntilThread2Waiting(): Unit = {
+      while (true) {
+        Thread.sleep(100)
+        val t2 = THREAD2_HOLDER.get()
+        if (t2 != null && t2.getState.equals(Thread.State.WAITING)) {

Review Comment:
   nit: use Option instead of null check?
   ```scala
   if (Option(THREAD2_HOLDER.get()).exists(_.getState.equals(Thread.State.WAITING))) {
   ```



##########
core/src/main/scala/org/apache/spark/util/NonFateSharingCache.scala:
##########
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.util
+
+import java.util.concurrent.Callable
+
+import com.google.common.cache.Cache
+import com.google.common.cache.LoadingCache
+
+/**
+ * SPARK-43300: Guava cache fate-sharing behavior might lead to unexpected cascade failure:
+ * when multiple threads access the same key in the cache at the same time when the key is not in
+ * the cache, Guava cache will block all requests and load the data only once. If the loading fails,
+ * all requests will fail immediately without retry. Therefore individual failure will also fail
+ * other irrelevant queries who are waiting for the same key.
+ *
+ * This util create a delegation Cache with KeyLock to synchronize threads looking for the same key
+ * so that they should run individually and fail as if they had arrived one at a time.
+ *
+ * Instead of implementing Guava Cache and LoadingCache interface, we defined our own narrower APIs

Review Comment:
   ```suggestion
    * Instead of implementing Guava Cache and LoadingCache interface, we expose a subset of APIs
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org