Posted to commits@spark.apache.org by zs...@apache.org on 2017/02/17 19:50:38 UTC
spark git commit: [SPARK-19517][SS] KafkaSource fails to initialize partition offsets
Repository: spark
Updated Branches:
refs/heads/master 4cc06f4eb -> 1a3f5f8c5
[SPARK-19517][SS] KafkaSource fails to initialize partition offsets
## What changes were proposed in this pull request?
This patch fixes a bug in `KafkaSource`'s (de)serialization of the initial partition offsets: the length of the offsets JSON string was written and read back as a single byte, which corrupts the log once the JSON exceeds 255 bytes.
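
For context (an editorial sketch, not part of the original message): `OutputStream.write(int)` writes only the low-order byte of its argument, so the old code silently truncated any payload length above 255, and the matching single `InputStream.read(bytes)` call was not guaranteed to fill the buffer either. A minimal, self-contained illustration with a hypothetical payload:

```scala
import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.charset.StandardCharsets

object LengthTruncationDemo extends App {
  // Stand-in for an offsets JSON longer than 255 bytes, which is easy
  // to hit with many topics/partitions; the content is hypothetical.
  val json = "x" * 300
  val bytes = json.getBytes(StandardCharsets.UTF_8)

  val out = new ByteArrayOutputStream()
  out.write(bytes.length) // write(Int) keeps only the low byte: 300 & 0xFF == 44
  out.write(bytes)

  val in = new ByteArrayInputStream(out.toByteArray)
  val length = in.read()            // reads back 44, not 300
  val buf = new Array[Byte](length)
  in.read(buf)                      // on a real stream this may also under-fill
  assert(new String(buf, StandardCharsets.UTF_8) != json) // truncated prefix
}
```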
## How was this patch tested?
I ran the test suite for spark-sql-kafka-0-10.
Author: Roberto Agostino Vitillo <ra...@gmail.com>
Closes #16857 from vitillo/kafka_source_fix.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1a3f5f8c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1a3f5f8c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1a3f5f8c
Branch: refs/heads/master
Commit: 1a3f5f8c55d873aaf8145a8bc4867fc9902cf93d
Parents: 4cc06f4
Author: Roberto Agostino Vitillo <ra...@gmail.com>
Authored: Fri Feb 17 11:43:57 2017 -0800
Committer: Shixiong Zhu <sh...@databricks.com>
Committed: Fri Feb 17 11:44:18 2017 -0800
----------------------------------------------------------------------
dev/.rat-excludes | 1 +
.../apache/spark/sql/kafka010/KafkaSource.scala | 32 ++++--
...afka-source-initial-offset-version-2.1.0.bin | 1 +
.../spark/sql/kafka010/KafkaSourceSuite.scala | 104 +++++++++++++++++++
4 files changed, 131 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/1a3f5f8c/dev/.rat-excludes
----------------------------------------------------------------------
diff --git a/dev/.rat-excludes b/dev/.rat-excludes
index 6d24434..2355d40 100644
--- a/dev/.rat-excludes
+++ b/dev/.rat-excludes
@@ -105,3 +105,4 @@ org.apache.spark.scheduler.ExternalClusterManager
org.apache.spark.deploy.yarn.security.ServiceCredentialProvider
spark-warehouse
structured-streaming/*
+kafka-source-initial-offset-version-2.1.0.bin
http://git-wip-us.apache.org/repos/asf/spark/blob/1a3f5f8c/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala
----------------------------------------------------------------------
diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala
index 9c5dcec..92b5d91 100644
--- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala
+++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala
@@ -21,6 +21,7 @@ import java.{util => ju}
import java.io._
import java.nio.charset.StandardCharsets
+import org.apache.commons.io.IOUtils
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkContext
@@ -97,16 +98,31 @@ private[kafka010] class KafkaSource(
val metadataLog =
new HDFSMetadataLog[KafkaSourceOffset](sqlContext.sparkSession, metadataPath) {
override def serialize(metadata: KafkaSourceOffset, out: OutputStream): Unit = {
- val bytes = metadata.json.getBytes(StandardCharsets.UTF_8)
- out.write(bytes.length)
- out.write(bytes)
+ out.write(0) // A zero byte is written to support Spark 2.1.0 (SPARK-19517)
+ val writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))
+ writer.write(VERSION)
+ writer.write(metadata.json)
+ writer.flush
}
override def deserialize(in: InputStream): KafkaSourceOffset = {
- val length = in.read()
- val bytes = new Array[Byte](length)
- in.read(bytes)
- KafkaSourceOffset(SerializedOffset(new String(bytes, StandardCharsets.UTF_8)))
+ in.read() // A zero byte is read to support Spark 2.1.0 (SPARK-19517)
+ val content = IOUtils.toString(new InputStreamReader(in, StandardCharsets.UTF_8))
+ // HDFSMetadataLog guarantees that it never creates a partial file.
+ assert(content.length != 0)
+ if (content(0) == 'v') {
+ if (content.startsWith(VERSION)) {
+ KafkaSourceOffset(SerializedOffset(content.substring(VERSION.length)))
+ } else {
+ val versionInFile = content.substring(0, content.indexOf("\n"))
+ throw new IllegalStateException(
+ s"Unsupported format. Expected version is ${VERSION.stripLineEnd} " +
+ s"but was $versionInFile. Please upgrade your Spark.")
+ }
+ } else {
+ // The log was generated by Spark 2.1.0
+ KafkaSourceOffset(SerializedOffset(content))
+ }
}
}
@@ -335,6 +351,8 @@ private[kafka010] object KafkaSource {
| source option "failOnDataLoss" to "false".
""".stripMargin
+ private val VERSION = "v1\n"
+
def getSortedExecutorList(sc: SparkContext): Array[String] = {
val bm = sc.env.blockManager
bm.master.getPeers(bm.blockManagerId).toArray
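
In short, the new on-disk layout is one zero byte followed by the UTF-8 text `v1\n` plus the offsets JSON. A Spark 2.1.0 reader interprets the zero byte as a zero length and fails while parsing the resulting empty offset (the test suite below checks for `IllegalArgumentException`), while the new reader skips it and dispatches on the `v` prefix. A self-contained editorial sketch of that round trip, using a hypothetical JSON payload:

```scala
import java.io._
import java.nio.charset.StandardCharsets

import org.apache.commons.io.IOUtils

object VersionedOffsetLogDemo extends App {
  val VERSION = "v1\n"
  val json = """{"t":{"0":5}}""" // hypothetical offsets JSON

  // Write: a zero byte first, then the version marker and JSON as UTF-8 text.
  val out = new ByteArrayOutputStream()
  out.write(0)
  val writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))
  writer.write(VERSION)
  writer.write(json)
  writer.flush()

  // Read: skip the zero byte, then branch on the version prefix; content
  // without a leading 'v' is treated as a raw Spark 2.1.0 JSON payload.
  val in = new ByteArrayInputStream(out.toByteArray)
  in.read()
  val content = IOUtils.toString(new InputStreamReader(in, StandardCharsets.UTF_8))
  val offsetJson =
    if (content.startsWith(VERSION)) content.substring(VERSION.length)
    else content
  assert(offsetJson == json)
}
```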
http://git-wip-us.apache.org/repos/asf/spark/blob/1a3f5f8c/external/kafka-0-10-sql/src/test/resources/kafka-source-initial-offset-version-2.1.0.bin
----------------------------------------------------------------------
diff --git a/external/kafka-0-10-sql/src/test/resources/kafka-source-initial-offset-version-2.1.0.bin b/external/kafka-0-10-sql/src/test/resources/kafka-source-initial-offset-version-2.1.0.bin
new file mode 100644
index 0000000..ae928e7
--- /dev/null
+++ b/external/kafka-0-10-sql/src/test/resources/kafka-source-initial-offset-version-2.1.0.bin
@@ -0,0 +1 @@
+2{"kafka-initial-offset-2-1-0":{"2":0,"1":0,"0":0}}
\ No newline at end of file
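
An editorial note on this fixture: the leading `2` is not part of the JSON but the single length byte written by the Spark 2.1.0 serializer. The JSON that follows is exactly 50 bytes, and 50 is the ASCII code for the digit `2`, which is why the length byte happens to render as a printable character:

```scala
// Sanity check of the fixture layout (not from the original commit).
val json = """{"kafka-initial-offset-2-1-0":{"2":0,"1":0,"0":0}}"""
assert(json.getBytes("UTF-8").length == 50)
assert(50.toChar == '2') // the 2.1.0 length byte renders as the digit '2'
```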
http://git-wip-us.apache.org/repos/asf/spark/blob/1a3f5f8c/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala
----------------------------------------------------------------------
diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala
index 211c8a5..4f82b13 100644
--- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala
+++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala
@@ -17,7 +17,9 @@
package org.apache.spark.sql.kafka010
+import java.io._
import java.nio.charset.StandardCharsets.UTF_8
+import java.nio.file.{Files, Paths}
import java.util.Properties
import java.util.concurrent.ConcurrentLinkedQueue
import java.util.concurrent.atomic.AtomicInteger
@@ -141,6 +143,108 @@ class KafkaSourceSuite extends KafkaSourceTest {
private val topicId = new AtomicInteger(0)
+ testWithUninterruptibleThread(
+ "deserialization of initial offset with Spark 2.1.0") {
+ withTempDir { metadataPath =>
+ val topic = newTopic
+ testUtils.createTopic(topic, partitions = 3)
+
+ val provider = new KafkaSourceProvider
+ val parameters = Map(
+ "kafka.bootstrap.servers" -> testUtils.brokerAddress,
+ "subscribe" -> topic
+ )
+ val source = provider.createSource(spark.sqlContext, metadataPath.getAbsolutePath, None,
+ "", parameters)
+ source.getOffset.get // Write initial offset
+
+ // Make sure Spark 2.1.0 will throw an exception when reading the new log
+ intercept[java.lang.IllegalArgumentException] {
+ // Simulate how Spark 2.1.0 reads the log
+ val in = new FileInputStream(metadataPath.getAbsolutePath + "/0")
+ val length = in.read()
+ val bytes = new Array[Byte](length)
+ in.read(bytes)
+ KafkaSourceOffset(SerializedOffset(new String(bytes, UTF_8)))
+ }
+ }
+ }
+
+ testWithUninterruptibleThread("deserialization of initial offset written by Spark 2.1.0") {
+ withTempDir { metadataPath =>
+ val topic = "kafka-initial-offset-2-1-0"
+ testUtils.createTopic(topic, partitions = 3)
+
+ val provider = new KafkaSourceProvider
+ val parameters = Map(
+ "kafka.bootstrap.servers" -> testUtils.brokerAddress,
+ "subscribe" -> topic
+ )
+
+ val from = Paths.get(
+ getClass.getResource("/kafka-source-initial-offset-version-2.1.0.bin").getPath)
+ val to = Paths.get(s"${metadataPath.getAbsolutePath}/0")
+ Files.copy(from, to)
+
+ val source = provider.createSource(spark.sqlContext, metadataPath.getAbsolutePath, None,
+ "", parameters)
+ val deserializedOffset = source.getOffset.get
+ val referenceOffset = KafkaSourceOffset((topic, 0, 0L), (topic, 1, 0L), (topic, 2, 0L))
+ assert(referenceOffset == deserializedOffset)
+ }
+ }
+
+ testWithUninterruptibleThread("deserialization of initial offset written by future version") {
+ withTempDir { metadataPath =>
+ val futureMetadataLog =
+ new HDFSMetadataLog[KafkaSourceOffset](sqlContext.sparkSession,
+ metadataPath.getAbsolutePath) {
+ override def serialize(metadata: KafkaSourceOffset, out: OutputStream): Unit = {
+ out.write(0)
+ val writer = new BufferedWriter(new OutputStreamWriter(out, UTF_8))
+ writer.write(s"v0\n${metadata.json}")
+ writer.flush
+ }
+ }
+
+ val topic = newTopic
+ testUtils.createTopic(topic, partitions = 3)
+ val offset = KafkaSourceOffset((topic, 0, 0L), (topic, 1, 0L), (topic, 2, 0L))
+ futureMetadataLog.add(0, offset)
+
+ val provider = new KafkaSourceProvider
+ val parameters = Map(
+ "kafka.bootstrap.servers" -> testUtils.brokerAddress,
+ "subscribe" -> topic
+ )
+ val source = provider.createSource(spark.sqlContext, metadataPath.getAbsolutePath, None,
+ "", parameters)
+
+ val e = intercept[java.lang.IllegalStateException] {
+ source.getOffset.get // Read initial offset
+ }
+
+ assert(e.getMessage.contains("Please upgrade your Spark"))
+ }
+ }
+
+ test("(de)serialization of initial offsets") {
+ val topic = newTopic()
+ testUtils.createTopic(topic, partitions = 64)
+
+ val reader = spark
+ .readStream
+ .format("kafka")
+ .option("kafka.bootstrap.servers", testUtils.brokerAddress)
+ .option("subscribe", topic)
+
+ testStream(reader.load)(
+ makeSureGetOffsetCalled,
+ StopStream,
+ StartStream(),
+ StopStream)
+ }
+
test("maxOffsetsPerTrigger") {
val topic = newTopic()
testUtils.createTopic(topic, partitions = 3)