You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ge...@apache.org on 2020/08/21 05:19:05 UTC
[spark] branch branch-3.0 updated: [SPARK-32660][SQL][DOC] Show Avro related API in documentation
This is an automated email from the ASF dual-hosted git repository.
gengliang pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 2932926 [SPARK-32660][SQL][DOC] Show Avro related API in documentation
2932926 is described below
commit 2932926a0ed8456ad73e1d34bee8ccc03293c635
Author: Gengliang Wang <ge...@databricks.com>
AuthorDate: Fri Aug 21 13:12:43 2020 +0800
[SPARK-32660][SQL][DOC] Show Avro related API in documentation
### What changes were proposed in this pull request?
Currently, the Avro-related APIs are missing from the official API documentation (https://spark.apache.org/docs/latest/api/scala/org/apache/spark/index.html). This PR is to:
1. Mark internal Avro related classes as private
2. Show Avro related API in Spark official API documentation
### Why are the changes needed?
Better documentation.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Build doc and preview:
![image](https://user-images.githubusercontent.com/1097932/90623042-d156ee00-e1ca-11ea-9edd-2c45b3001fd8.png)
![image](https://user-images.githubusercontent.com/1097932/90623047-d451de80-e1ca-11ea-94ba-02921b64d6f1.png)
![image](https://user-images.githubusercontent.com/1097932/90623058-d6b43880-e1ca-11ea-849a-b9ea9efe6527.png)
Closes #29476 from gengliangwang/avroAPIDoc.
Authored-by: Gengliang Wang <ge...@databricks.com>
Signed-off-by: Gengliang Wang <ge...@databricks.com>
(cherry picked from commit de141a32714fd2dbc4be2d540adabf328bbce2c4)
Signed-off-by: Gengliang Wang <ge...@databricks.com>
---
.../spark/sql/avro/SparkAvroKeyOutputFormat.java | 4 ++--
.../apache/spark/sql/avro/AvroDataToCatalyst.scala | 2 +-
.../apache/spark/sql/avro/AvroDeserializer.scala | 2 +-
.../org/apache/spark/sql/avro/AvroOptions.scala | 4 ++--
.../org/apache/spark/sql/avro/AvroSerializer.scala | 2 +-
.../scala/org/apache/spark/sql/avro/AvroUtils.scala | 2 +-
.../apache/spark/sql/avro/CatalystDataToAvro.scala | 2 +-
.../apache/spark/sql/avro/SchemaConverters.scala | 21 ++++++++++++++++++---
project/SparkBuild.scala | 5 +++--
9 files changed, 30 insertions(+), 14 deletions(-)
diff --git a/external/avro/src/main/java/org/apache/spark/sql/avro/SparkAvroKeyOutputFormat.java b/external/avro/src/main/java/org/apache/spark/sql/avro/SparkAvroKeyOutputFormat.java
index 55696a6..a455584 100644
--- a/external/avro/src/main/java/org/apache/spark/sql/avro/SparkAvroKeyOutputFormat.java
+++ b/external/avro/src/main/java/org/apache/spark/sql/avro/SparkAvroKeyOutputFormat.java
@@ -35,8 +35,8 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext;
// A variant of `AvroKeyOutputFormat`, which is used to inject the custom `RecordWriterFactory` so
// that we can set avro file metadata.
-public class SparkAvroKeyOutputFormat extends AvroKeyOutputFormat<GenericRecord> {
- public SparkAvroKeyOutputFormat(Map<String, String> metadata) {
+class SparkAvroKeyOutputFormat extends AvroKeyOutputFormat<GenericRecord> {
+ SparkAvroKeyOutputFormat(Map<String, String> metadata) {
super(new SparkRecordWriterFactory(metadata));
}
diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDataToCatalyst.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDataToCatalyst.scala
index 79c7205..0fdf5c6 100644
--- a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDataToCatalyst.scala
+++ b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDataToCatalyst.scala
@@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGe
import org.apache.spark.sql.catalyst.util.{FailFastMode, ParseMode, PermissiveMode}
import org.apache.spark.sql.types._
-case class AvroDataToCatalyst(
+private[avro] case class AvroDataToCatalyst(
child: Expression,
jsonFormatSchema: String,
options: Map[String, String])
diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala
index 4fc8040..7580957 100644
--- a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala
+++ b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala
@@ -42,7 +42,7 @@ import org.apache.spark.unsafe.types.UTF8String
/**
* A deserializer to deserialize data in avro format to data in catalyst format.
*/
-class AvroDeserializer(
+private[sql] class AvroDeserializer(
rootAvroType: Schema,
rootCatalystType: DataType,
datetimeRebaseMode: LegacyBehaviorPolicy.Value) {
diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOptions.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOptions.scala
index f3ea785..8972b05 100644
--- a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOptions.scala
+++ b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOptions.scala
@@ -27,7 +27,7 @@ import org.apache.spark.sql.internal.SQLConf
/**
* Options for Avro Reader and Writer stored in case insensitive manner.
*/
-class AvroOptions(
+private[sql] class AvroOptions(
@transient val parameters: CaseInsensitiveMap[String],
@transient val conf: Configuration) extends Logging with Serializable {
@@ -95,7 +95,7 @@ class AvroOptions(
parameters.get("mode").map(ParseMode.fromString).getOrElse(FailFastMode)
}
-object AvroOptions {
+private[sql] object AvroOptions {
def apply(parameters: Map[String, String]): AvroOptions = {
val hadoopConf = SparkSession
.getActiveSession
diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala
index d6cfbc5..2c0bffa 100644
--- a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala
+++ b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala
@@ -43,7 +43,7 @@ import org.apache.spark.sql.types._
/**
* A serializer to serialize data in catalyst format to data in avro format.
*/
-class AvroSerializer(
+private[sql] class AvroSerializer(
rootCatalystType: DataType,
rootAvroType: Schema,
nullable: Boolean,
diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala
index 70dcd58..9ff89f6 100644
--- a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala
+++ b/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala
@@ -37,7 +37,7 @@ import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.util.Utils
-object AvroUtils extends Logging {
+private[sql] object AvroUtils extends Logging {
def inferSchema(
spark: SparkSession,
options: Map[String, String],
diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/CatalystDataToAvro.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/CatalystDataToAvro.scala
index 7732c83..53910b7 100644
--- a/external/avro/src/main/scala/org/apache/spark/sql/avro/CatalystDataToAvro.scala
+++ b/external/avro/src/main/scala/org/apache/spark/sql/avro/CatalystDataToAvro.scala
@@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{BinaryType, DataType}
-case class CatalystDataToAvro(
+private[avro] case class CatalystDataToAvro(
child: Expression,
jsonFormatSchema: Option[String]) extends UnaryExpression {
diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala
index 3947d32..0e01fbf 100644
--- a/external/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala
+++ b/external/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala
@@ -24,6 +24,7 @@ import org.apache.avro.{LogicalTypes, Schema, SchemaBuilder}
import org.apache.avro.LogicalTypes.{Date, Decimal, TimestampMicros, TimestampMillis}
import org.apache.avro.Schema.Type._
+import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.catalyst.util.RandomUUIDGenerator
import org.apache.spark.sql.types._
import org.apache.spark.sql.types.Decimal.{maxPrecisionForBytes, minBytesForPrecision}
@@ -32,21 +33,29 @@ import org.apache.spark.sql.types.Decimal.{maxPrecisionForBytes, minBytesForPrec
* This object contains method that are used to convert sparkSQL schemas to avro schemas and vice
* versa.
*/
+@DeveloperApi
object SchemaConverters {
private lazy val uuidGenerator = RandomUUIDGenerator(new Random().nextLong())
private lazy val nullSchema = Schema.create(Schema.Type.NULL)
+ /**
+ * Internal wrapper for SQL data type and nullability.
+ *
+ * @since 2.4.0
+ */
case class SchemaType(dataType: DataType, nullable: Boolean)
/**
- * This function takes an avro schema and returns a sql schema.
+ * Converts an Avro schema to a corresponding Spark SQL schema.
+ *
+ * @since 2.4.0
*/
def toSqlType(avroSchema: Schema): SchemaType = {
toSqlTypeHelper(avroSchema, Set.empty)
}
- def toSqlTypeHelper(avroSchema: Schema, existingRecordNames: Set[String]): SchemaType = {
+ private def toSqlTypeHelper(avroSchema: Schema, existingRecordNames: Set[String]): SchemaType = {
avroSchema.getType match {
case INT => avroSchema.getLogicalType match {
case _: Date => SchemaType(DateType, nullable = false)
@@ -133,6 +142,11 @@ object SchemaConverters {
}
}
+ /**
+ * Converts a Spark SQL schema to a corresponding Avro schema.
+ *
+ * @since 2.4.0
+ */
def toAvroType(
catalystType: DataType,
nullable: Boolean = false,
@@ -192,4 +206,5 @@ object SchemaConverters {
}
}
-class IncompatibleSchemaException(msg: String, ex: Throwable = null) extends Exception(msg, ex)
+private[avro] class IncompatibleSchemaException(
+ msg: String, ex: Throwable = null) extends Exception(msg, ex)
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 7a9634a..a774a11 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -847,6 +847,7 @@ object Unidoc {
.map(_.filterNot(_.getCanonicalPath.contains("org/apache/spark/sql/hive/test")))
.map(_.filterNot(_.getCanonicalPath.contains("org/apache/spark/sql/catalog/v2/utils")))
.map(_.filterNot(_.getCanonicalPath.contains("org/apache/hive")))
+ .map(_.filterNot(_.getCanonicalPath.contains("org/apache/spark/sql/v2/avro")))
}
private def ignoreClasspaths(classpaths: Seq[Classpath]): Seq[Classpath] = {
@@ -862,10 +863,10 @@ object Unidoc {
unidocProjectFilter in(ScalaUnidoc, unidoc) :=
inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, kubernetes,
- yarn, tags, streamingKafka010, sqlKafka010, avro),
+ yarn, tags, streamingKafka010, sqlKafka010),
unidocProjectFilter in(JavaUnidoc, unidoc) :=
inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, kubernetes,
- yarn, tags, streamingKafka010, sqlKafka010, avro),
+ yarn, tags, streamingKafka010, sqlKafka010),
unidocAllClasspaths in (ScalaUnidoc, unidoc) := {
ignoreClasspaths((unidocAllClasspaths in (ScalaUnidoc, unidoc)).value)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org