You are viewing a plain text version of this content. The canonical link for it is here.
Posted to jira@kafka.apache.org by GitBox <gi...@apache.org> on 2020/12/09 21:01:18 UTC

[GitHub] [kafka] hachikuji commented on a change in pull request #9713: KAFKA-10825 ZK ISR manager

hachikuji commented on a change in pull request #9713:
URL: https://github.com/apache/kafka/pull/9713#discussion_r539640590



##########
File path: core/src/main/scala/kafka/server/ZkIsrManager.scala
##########
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package kafka.server
+
+import kafka.utils.{Logging, ReplicationUtils, Scheduler}
+import kafka.zk.KafkaZkClient
+import org.apache.kafka.common.TopicPartition
+import org.apache.kafka.common.protocol.Errors
+import org.apache.kafka.common.utils.Time
+
+import java.util.concurrent.TimeUnit
+import java.util.concurrent.atomic.AtomicLong
+import scala.collection.mutable
+
+/**
+ * @param checkIntervalMs How often to check for ISR
+ * @param maxDelayMs  Maximum time that an ISR change may be delayed before sending the notification
+ * @param lingerMs  Maximum time to await additional changes before sending the notification
+ */
+case class IsrChangePropagationConfig(checkIntervalMs: Long, maxDelayMs: Long, lingerMs: Long)
+
+object ZkIsrManager {
+  // This field is mutable to allow overriding change notification behavior in test cases
+  @volatile var DefaultIsrPropagationConfig: IsrChangePropagationConfig = IsrChangePropagationConfig(
+    checkIntervalMs = 2500,
+    lingerMs = 5000,
+    maxDelayMs = 60000,
+  )
+}
+
+class ZkIsrManager(scheduler: Scheduler, time: Time, zkClient: KafkaZkClient) extends AlterIsrManager with Logging {
+
+  private val isrChangeNotificationConfig = ZkIsrManager.DefaultIsrPropagationConfig
+  // Visible for testing
+  private[server] val isrChangeSet: mutable.Set[TopicPartition] = new mutable.HashSet[TopicPartition]()
+  private val lastIsrChangeMs = new AtomicLong(time.milliseconds())
+  private val lastIsrPropagationMs = new AtomicLong(time.milliseconds())
+
+  override def start(): Unit = {
+    scheduler.schedule("isr-change-propagation", maybePropagateIsrChanges _,
+      period = isrChangeNotificationConfig.checkIntervalMs, unit = TimeUnit.MILLISECONDS)
+  }
+
+  override def clearPending(topicPartition: TopicPartition): Unit = {
+    // Since we always immediately process ZK updates and never actually enqueue anything, there is nothing to
+    // clear here so this is a no-op. Even if there are changes that have not been propagated, the write to ZK
+    // has already happened, so we may as well send the notification to the controller.
+  }
+
+  override def enqueue(alterIsrItem: AlterIsrItem): Boolean = {

Review comment:
       The name "enqueue" suggests an asynchronous change. I wonder if there is another name we could use. Perhaps "submit" suggests less about the implementation?

##########
File path: core/src/main/scala/kafka/cluster/Partition.scala
##########
@@ -51,40 +51,8 @@ trait IsrChangeListener {
   def markFailed(): Unit
 }
 
-trait PartitionStateStore {
-  def fetchTopicConfig(): Properties
-  def shrinkIsr(controllerEpoch: Int, leaderAndIsr: LeaderAndIsr): Option[Int]
-  def expandIsr(controllerEpoch: Int, leaderAndIsr: LeaderAndIsr): Option[Int]
-}
-
-class ZkPartitionStateStore(topicPartition: TopicPartition,
-                            zkClient: KafkaZkClient) extends PartitionStateStore {
-
-  override def fetchTopicConfig(): Properties = {
-    val adminZkClient = new AdminZkClient(zkClient)
-    adminZkClient.fetchEntityConfig(ConfigType.Topic, topicPartition.topic)
-  }
-
-  override def shrinkIsr(controllerEpoch: Int, leaderAndIsr: LeaderAndIsr): Option[Int] = {
-    val newVersionOpt = updateIsr(controllerEpoch, leaderAndIsr)
-    newVersionOpt
-  }
-
-  override def expandIsr(controllerEpoch: Int, leaderAndIsr: LeaderAndIsr): Option[Int] = {
-    val newVersionOpt = updateIsr(controllerEpoch, leaderAndIsr)
-    newVersionOpt
-  }
-
-  private def updateIsr(controllerEpoch: Int, leaderAndIsr: LeaderAndIsr): Option[Int] = {
-    val (updateSucceeded, newVersion) = ReplicationUtils.updateLeaderAndIsr(zkClient, topicPartition,
-      leaderAndIsr, controllerEpoch)
-
-    if (updateSucceeded) {
-      Some(newVersion)
-    } else {
-      None
-    }
-  }
+trait TopicConfigProvider {
+  def get(): Properties

Review comment:
       nit: I wonder if it might be better to use a verb like `fetch`, which suggests that some overhead is involved.

##########
File path: core/src/main/scala/kafka/server/AlterIsrManager.scala
##########
@@ -46,13 +51,36 @@ trait AlterIsrManager {
   def clearPending(topicPartition: TopicPartition): Unit
 }
 
-case class AlterIsrItem(topicPartition: TopicPartition, leaderAndIsr: LeaderAndIsr, callback: Either[Errors, LeaderAndIsr] => Unit)
+case class AlterIsrItem(topicPartition: TopicPartition,
+                        leaderAndIsr: LeaderAndIsr,
+                        callback: Either[Errors, LeaderAndIsr] => Unit,
+                        controllerEpoch: Int = -1) // controllerEpoch needed for Zk impl

Review comment:
       We don't have to do it here, but I think the controller epoch is no longer needed. This was previously used as a sort of poor man's controller fencing, but now the controller has a stronger mechanism relying on conditional zk updates.
   
   By the way, do we need the default value? Especially the fact that it is a sentinel seems dangerous.

##########
File path: core/src/main/scala/kafka/server/ZkIsrManager.scala
##########
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package kafka.server
+
+import kafka.utils.{Logging, ReplicationUtils, Scheduler}
+import kafka.zk.KafkaZkClient
+import org.apache.kafka.common.TopicPartition
+import org.apache.kafka.common.protocol.Errors
+import org.apache.kafka.common.utils.Time
+
+import java.util.concurrent.TimeUnit
+import java.util.concurrent.atomic.AtomicLong
+import scala.collection.mutable
+
+/**
+ * @param checkIntervalMs How often to check for ISR
+ * @param maxDelayMs  Maximum time that an ISR change may be delayed before sending the notification
+ * @param lingerMs  Maximum time to await additional changes before sending the notification
+ */
+case class IsrChangePropagationConfig(checkIntervalMs: Long, maxDelayMs: Long, lingerMs: Long)
+
+object ZkIsrManager {
+  // This field is mutable to allow overriding change notification behavior in test cases
+  @volatile var DefaultIsrPropagationConfig: IsrChangePropagationConfig = IsrChangePropagationConfig(
+    checkIntervalMs = 2500,
+    lingerMs = 5000,
+    maxDelayMs = 60000,
+  )
+}
+
+class ZkIsrManager(scheduler: Scheduler, time: Time, zkClient: KafkaZkClient) extends AlterIsrManager with Logging {
+
+  private val isrChangeNotificationConfig = ZkIsrManager.DefaultIsrPropagationConfig
+  // Visible for testing
+  private[server] val isrChangeSet: mutable.Set[TopicPartition] = new mutable.HashSet[TopicPartition]()
+  private val lastIsrChangeMs = new AtomicLong(time.milliseconds())
+  private val lastIsrPropagationMs = new AtomicLong(time.milliseconds())
+
+  override def start(): Unit = {
+    scheduler.schedule("isr-change-propagation", maybePropagateIsrChanges _,
+      period = isrChangeNotificationConfig.checkIntervalMs, unit = TimeUnit.MILLISECONDS)
+  }
+
+  override def clearPending(topicPartition: TopicPartition): Unit = {
+    // Since we always immediately process ZK updates and never actually enqueue anything, there is nothing to
+    // clear here so this is a no-op. Even if there are changes that have not been propagated, the write to ZK
+    // has already happened, so we may as well send the notification to the controller.
+  }
+
+  override def enqueue(alterIsrItem: AlterIsrItem): Boolean = {
+    debug(s"Writing new ISR " + alterIsrItem.leaderAndIsr.isr + " to ZooKeeper with version " +
+      alterIsrItem.leaderAndIsr.zkVersion + " for partition " + alterIsrItem.topicPartition)
+
+    val (updateSucceeded, newVersion) = ReplicationUtils.updateLeaderAndIsr(zkClient, alterIsrItem.topicPartition,
+      alterIsrItem.leaderAndIsr, alterIsrItem.controllerEpoch)
+
+    if (updateSucceeded) {
+      // Track which partitions need to be propagated to the controller
+      isrChangeSet synchronized {
+        isrChangeSet += alterIsrItem.topicPartition
+        lastIsrChangeMs.set(time.milliseconds())
+      }
+
+      // We actually need to apply the callback in another thread since Partition#sendAlterIsrRequest will write

Review comment:
       Hmm.. It feels a tad brittle to rely on locking assumptions like this. Perhaps we could let `enqueue` return a different response in the case that the change was applied immediately? For example, we could return `Option[Int]` to indicate the new version if the change was applied. Note that we can probably discard the boolean return type and let `enqueue` raise an illegal state exception directly since that is what `Partition` is already doing.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org