You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2022/11/17 11:01:08 UTC
[spark] branch master updated: [SPARK-41164][CONNECT] Update relations.proto to follow Connect proto development guide
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f01a8db4bcf [SPARK-41164][CONNECT] Update relations.proto to follow Connect proto development guide
f01a8db4bcf is described below
commit f01a8db4bcf09c4975029e85722053ff82f8a355
Author: Rui Wang <ru...@databricks.com>
AuthorDate: Thu Nov 17 20:00:52 2022 +0900
[SPARK-41164][CONNECT] Update relations.proto to follow Connect proto development guide
### What changes were proposed in this pull request?
Now that there is a development guide for Connect proto messages ([adding proto messages](https://github.com/apache/spark/blob/master/connector/connect/docs/adding-proto-messages.md)), this PR updates `relations.proto` to follow that guide.
This PR also adds some missing documentation for the proto.
### Why are the changes needed?
1. Follow development guide.
2. Improve proto Documentation.
### Does this PR introduce _any_ user-facing change?
NO
### How was this patch tested?
Existing UT
Closes #38678 from amaliujia/improve_relation_proto_to_follow_proto_rules.
Authored-by: Rui Wang <ru...@databricks.com>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
.../main/protobuf/spark/connect/relations.proto | 128 +++++++--
python/pyspark/sql/connect/proto/relations_pb2.py | 114 ++++----
python/pyspark/sql/connect/proto/relations_pb2.pyi | 302 +++++++++++++++------
3 files changed, 372 insertions(+), 172 deletions(-)
diff --git a/connector/connect/src/main/protobuf/spark/connect/relations.proto b/connector/connect/src/main/protobuf/spark/connect/relations.proto
index 8fa10c4e093..52ff780d093 100644
--- a/connector/connect/src/main/protobuf/spark/connect/relations.proto
+++ b/connector/connect/src/main/protobuf/spark/connect/relations.proto
@@ -67,11 +67,13 @@ message Unknown {}
// Common metadata of all relations.
message RelationCommon {
+ // (Required) Shared relation metadata.
string source_info = 1;
}
// Relation that uses a SQL query to generate the output.
message SQL {
+ // (Required) The SQL query.
string query = 1;
}
@@ -84,15 +86,20 @@ message Read {
}
message NamedTable {
+ // (Required) Unparsed identifier for the table.
string unparsed_identifier = 1;
}
message DataSource {
- // Required. Supported formats include: parquet, orc, text, json, parquet, csv, avro.
+ // (Required) Supported formats include: parquet, orc, text, json, csv, avro.
string format = 1;
- // Optional. If not set, Spark will infer the schema.
- string schema = 2;
- // The key is case insensitive.
+
+ // (Optional) If not set, Spark will infer the schema.
+ optional string schema = 2;
+
+ // Options for the data source. The context of this map varies based on the
+ // data source format. The options may be empty for a valid data source format.
+ // The map key is case insensitive.
map<string, string> options = 3;
}
}
@@ -106,13 +113,18 @@ message Project {
//
// For example, `SELECT ABS(-1)` is valid plan without an input plan.
Relation input = 1;
+
+ // (Required) A Project requires at least one expression.
repeated Expression expressions = 3;
}
// Relation that applies a boolean expression `condition` on each row of `input` to produce
// the output result.
message Filter {
+ // (Required) Input relation for a Filter.
Relation input = 1;
+
+ // (Required) A Filter must have a condition expression.
Expression condition = 2;
}
@@ -120,10 +132,20 @@ message Filter {
//
// `left` and `right` must be present.
message Join {
+ // (Required) Left input relation for a Join.
Relation left = 1;
+
+ // (Required) Right input relation for a Join.
Relation right = 2;
+
+ // (Optional) The join condition. Could be unset when `using_columns` is utilized.
+ //
+ // This field does not co-exist with using_columns.
Expression join_condition = 3;
+
+ // (Required) The join type.
JoinType join_type = 4;
+
// Optional. using_columns provides a list of columns that should present on both sides of
// the join inputs that this Join will join on. For example A JOIN B USING col_name is
// equivalent to A JOIN B on A.col_name = B.col_name.
@@ -144,11 +166,25 @@ message Join {
// Relation of type [[SetOperation]]
message SetOperation {
+ // (Required) Left input relation for a Set operation.
Relation left_input = 1;
+
+ // (Required) Right input relation for a Set operation.
Relation right_input = 2;
+
+ // (Required) The Set operation type.
SetOpType set_op_type = 3;
- bool is_all = 4;
- bool by_name = 5;
+
+ // (Optional) Whether to preserve duplicate rows.
+ //
+ // True to preserve all results.
+ // False to remove duplicate rows.
+ optional bool is_all = 4;
+
+ // (Optional) Whether to perform the Set operation based on name resolution.
+ //
+ // Only UNION supports this option.
+ optional bool by_name = 5;
enum SetOpType {
SET_OP_TYPE_UNSPECIFIED = 0;
@@ -160,29 +196,42 @@ message SetOperation {
// Relation of type [[Limit]] that is used to `limit` rows from the input relation.
message Limit {
+ // (Required) Input relation for a Limit.
Relation input = 1;
+
+ // (Required) the limit.
int32 limit = 2;
}
// Relation of type [[Offset]] that is used to read rows staring from the `offset` on
// the input relation.
message Offset {
+ // (Required) Input relation for an Offset.
Relation input = 1;
+
+ // (Required) The offset at which to start reading rows of the input relation.
int32 offset = 2;
}
// Relation of type [[Aggregate]].
message Aggregate {
+ // (Required) Input relation for an Aggregate.
Relation input = 1;
+
repeated Expression grouping_expressions = 2;
repeated Expression result_expressions = 3;
}
// Relation of type [[Sort]].
message Sort {
+ // (Required) Input relation for a Sort.
Relation input = 1;
+
+ // (Required) Sort fields.
repeated SortField sort_fields = 2;
- bool is_global = 3;
+
+ // (Optional) if this is a global sort.
+ optional bool is_global = 3;
message SortField {
Expression expression = 1;
@@ -206,33 +255,56 @@ message Sort {
// Relation of type [[Deduplicate]] which have duplicate rows removed, could consider either only
// the subset of columns or all the columns.
message Deduplicate {
+ // (Required) Input relation for a Deduplicate.
Relation input = 1;
+
+ // (Optional) Deduplicate based on a list of column names.
+ //
+ // This field cannot be used together with `all_columns_as_keys`.
repeated string column_names = 2;
- bool all_columns_as_keys = 3;
+
+ // (Optional) Deduplicate based on all the columns of the input relation.
+ //
+ // This field cannot be used together with `column_names`.
+ optional bool all_columns_as_keys = 3;
}
+// A relation that does not need to be qualified by name.
message LocalRelation {
+ // (Optional) A list of qualified attributes.
repeated Expression.QualifiedAttribute attributes = 1;
// TODO: support local data.
}
// Relation of type [[Sample]] that samples a fraction of the dataset.
message Sample {
+ // (Required) Input relation for a Sample.
Relation input = 1;
+
+ // (Required) lower bound.
double lower_bound = 2;
+
+ // (Required) upper bound.
double upper_bound = 3;
- bool with_replacement = 4;
+
+ // (Optional) Whether to sample with replacement.
+ optional bool with_replacement = 4;
+
+ // (Optional) The random seed.
optional int64 seed = 5;
}
// Relation of type [[Range]] that generates a sequence of integers.
message Range {
- // Optional. Default value = 0
- int64 start = 1;
- // Required.
+ // (Optional) Default value = 0
+ optional int64 start = 1;
+
+ // (Required)
int64 end = 2;
- // Required.
+
+ // (Required)
int64 step = 3;
+
// Optional. Default value is assigned by 1) SQL conf "spark.sql.leafNodeDefaultParallelism" if
// it is set, or 2) spark default parallelism.
optional int32 num_partitions = 4;
@@ -240,24 +312,26 @@ message Range {
// Relation alias.
message SubqueryAlias {
- // Required. The input relation.
+ // (Required) The input relation of SubqueryAlias.
Relation input = 1;
- // Required. The alias.
+
+ // (Required) The alias.
string alias = 2;
- // Optional. Qualifier of the alias.
+
+ // (Optional) Qualifier of the alias.
repeated string qualifier = 3;
}
// Relation repartition.
message Repartition {
- // Required. The input relation.
+ // (Required) The input relation of Repartition.
Relation input = 1;
- // Required. Must be positive.
+ // (Required) Must be positive.
int32 num_partitions = 2;
- // Optional. Default value is false.
- bool shuffle = 3;
+ // (Optional) Default value is false.
+ optional bool shuffle = 3;
}
// Compose the string representing rows for output.
@@ -267,14 +341,14 @@ message ShowString {
Relation input = 1;
// (Required) Number of rows to show.
- optional int32 numRows = 2;
+ int32 numRows = 2;
// (Required) If set to more than 0, truncates strings to
// `truncate` characters and all cells will be aligned right.
- optional int32 truncate = 3;
+ int32 truncate = 3;
// (Required) If set to true, prints output rows vertically (one line per column value).
- optional bool vertical = 4;
+ bool vertical = 4;
}
// Computes specified statistics for numeric and string columns.
@@ -344,10 +418,10 @@ message NAFill {
// Rename columns on the input relation by the same length of names.
message RenameColumnsBySameLengthNames {
- // Required. The input relation.
+ // (Required) The input relation of RenameColumnsBySameLengthNames.
Relation input = 1;
- // Required.
+ // (Required)
//
// The number of columns of the input relation must be equal to the length
// of this field. If this is not true, an exception will be returned.
@@ -357,11 +431,11 @@ message RenameColumnsBySameLengthNames {
// Rename columns on the input relation by a map with name to name mapping.
message RenameColumnsByNameToNameMap {
- // Required. The input relation.
+ // (Required) The input relation.
Relation input = 1;
- // Required.
+ // (Required)
//
// Renaming column names of input relation from A to B where A is the map key
// and B is the map value. This is a no-op if schema doesn't contain any A. It
diff --git a/python/pyspark/sql/connect/proto/relations_pb2.py b/python/pyspark/sql/connect/proto/relations_pb2.py
index 7e77d56eaa6..9bc5e75ea64 100644
--- a/python/pyspark/sql/connect/proto/relations_pb2.py
+++ b/python/pyspark/sql/connect/proto/relations_pb2.py
@@ -32,7 +32,7 @@ from pyspark.sql.connect.proto import expressions_pb2 as spark_dot_connect_dot_e
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
- b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x1fspark/connect/expressions.proto"\xa6\x0b\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0 [...]
+ b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x1fspark/connect/expressions.proto"\xa6\x0b\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0 [...]
)
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
@@ -54,63 +54,63 @@ if _descriptor._USE_C_DESCRIPTORS == False:
_SQL._serialized_start = 1592
_SQL._serialized_end = 1619
_READ._serialized_start = 1622
- _READ._serialized_end = 2032
+ _READ._serialized_end = 2048
_READ_NAMEDTABLE._serialized_start = 1764
_READ_NAMEDTABLE._serialized_end = 1825
_READ_DATASOURCE._serialized_start = 1828
- _READ_DATASOURCE._serialized_end = 2019
- _READ_DATASOURCE_OPTIONSENTRY._serialized_start = 1961
- _READ_DATASOURCE_OPTIONSENTRY._serialized_end = 2019
- _PROJECT._serialized_start = 2034
- _PROJECT._serialized_end = 2151
- _FILTER._serialized_start = 2153
- _FILTER._serialized_end = 2265
- _JOIN._serialized_start = 2268
- _JOIN._serialized_end = 2718
- _JOIN_JOINTYPE._serialized_start = 2531
- _JOIN_JOINTYPE._serialized_end = 2718
- _SETOPERATION._serialized_start = 2721
- _SETOPERATION._serialized_end = 3084
- _SETOPERATION_SETOPTYPE._serialized_start = 2970
- _SETOPERATION_SETOPTYPE._serialized_end = 3084
- _LIMIT._serialized_start = 3086
- _LIMIT._serialized_end = 3162
- _OFFSET._serialized_start = 3164
- _OFFSET._serialized_end = 3243
- _AGGREGATE._serialized_start = 3246
- _AGGREGATE._serialized_end = 3456
- _SORT._serialized_start = 3459
- _SORT._serialized_end = 3990
- _SORT_SORTFIELD._serialized_start = 3608
- _SORT_SORTFIELD._serialized_end = 3796
- _SORT_SORTDIRECTION._serialized_start = 3798
- _SORT_SORTDIRECTION._serialized_end = 3906
- _SORT_SORTNULLS._serialized_start = 3908
- _SORT_SORTNULLS._serialized_end = 3990
- _DEDUPLICATE._serialized_start = 3993
- _DEDUPLICATE._serialized_end = 4135
- _LOCALRELATION._serialized_start = 4137
- _LOCALRELATION._serialized_end = 4230
- _SAMPLE._serialized_start = 4233
- _SAMPLE._serialized_end = 4431
- _RANGE._serialized_start = 4434
- _RANGE._serialized_end = 4564
- _SUBQUERYALIAS._serialized_start = 4566
- _SUBQUERYALIAS._serialized_end = 4680
- _REPARTITION._serialized_start = 4682
- _REPARTITION._serialized_end = 4807
- _SHOWSTRING._serialized_start = 4810
- _SHOWSTRING._serialized_end = 5004
- _STATSUMMARY._serialized_start = 5006
- _STATSUMMARY._serialized_end = 5098
- _STATCROSSTAB._serialized_start = 5100
- _STATCROSSTAB._serialized_end = 5201
- _NAFILL._serialized_start = 5204
- _NAFILL._serialized_end = 5338
- _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_start = 5340
- _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_end = 5454
- _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_start = 5457
- _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_end = 5716
- _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_start = 5649
- _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_end = 5716
+ _READ_DATASOURCE._serialized_end = 2035
+ _READ_DATASOURCE_OPTIONSENTRY._serialized_start = 1966
+ _READ_DATASOURCE_OPTIONSENTRY._serialized_end = 2024
+ _PROJECT._serialized_start = 2050
+ _PROJECT._serialized_end = 2167
+ _FILTER._serialized_start = 2169
+ _FILTER._serialized_end = 2281
+ _JOIN._serialized_start = 2284
+ _JOIN._serialized_end = 2734
+ _JOIN_JOINTYPE._serialized_start = 2547
+ _JOIN_JOINTYPE._serialized_end = 2734
+ _SETOPERATION._serialized_start = 2737
+ _SETOPERATION._serialized_end = 3133
+ _SETOPERATION_SETOPTYPE._serialized_start = 2996
+ _SETOPERATION_SETOPTYPE._serialized_end = 3110
+ _LIMIT._serialized_start = 3135
+ _LIMIT._serialized_end = 3211
+ _OFFSET._serialized_start = 3213
+ _OFFSET._serialized_end = 3292
+ _AGGREGATE._serialized_start = 3295
+ _AGGREGATE._serialized_end = 3505
+ _SORT._serialized_start = 3508
+ _SORT._serialized_end = 4058
+ _SORT_SORTFIELD._serialized_start = 3662
+ _SORT_SORTFIELD._serialized_end = 3850
+ _SORT_SORTDIRECTION._serialized_start = 3852
+ _SORT_SORTDIRECTION._serialized_end = 3960
+ _SORT_SORTNULLS._serialized_start = 3962
+ _SORT_SORTNULLS._serialized_end = 4044
+ _DEDUPLICATE._serialized_start = 4061
+ _DEDUPLICATE._serialized_end = 4232
+ _LOCALRELATION._serialized_start = 4234
+ _LOCALRELATION._serialized_end = 4327
+ _SAMPLE._serialized_start = 4330
+ _SAMPLE._serialized_end = 4554
+ _RANGE._serialized_start = 4557
+ _RANGE._serialized_end = 4702
+ _SUBQUERYALIAS._serialized_start = 4704
+ _SUBQUERYALIAS._serialized_end = 4818
+ _REPARTITION._serialized_start = 4821
+ _REPARTITION._serialized_end = 4963
+ _SHOWSTRING._serialized_start = 4966
+ _SHOWSTRING._serialized_end = 5107
+ _STATSUMMARY._serialized_start = 5109
+ _STATSUMMARY._serialized_end = 5201
+ _STATCROSSTAB._serialized_start = 5203
+ _STATCROSSTAB._serialized_end = 5304
+ _NAFILL._serialized_start = 5307
+ _NAFILL._serialized_end = 5441
+ _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_start = 5443
+ _RENAMECOLUMNSBYSAMELENGTHNAMES._serialized_end = 5557
+ _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_start = 5560
+ _RENAMECOLUMNSBYNAMETONAMEMAP._serialized_end = 5819
+ _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_start = 5752
+ _RENAMECOLUMNSBYNAMETONAMEMAP_RENAMECOLUMNSMAPENTRY._serialized_end = 5819
# @@protoc_insertion_point(module_scope)
diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi
index 762ccd8c0e8..27c6db4e748 100644
--- a/python/pyspark/sql/connect/proto/relations_pb2.pyi
+++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi
@@ -319,6 +319,7 @@ class RelationCommon(google.protobuf.message.Message):
SOURCE_INFO_FIELD_NUMBER: builtins.int
source_info: builtins.str
+ """(Required) Shared relation metadata."""
def __init__(
self,
*,
@@ -337,6 +338,7 @@ class SQL(google.protobuf.message.Message):
QUERY_FIELD_NUMBER: builtins.int
query: builtins.str
+ """(Required) The SQL query."""
def __init__(
self,
*,
@@ -358,6 +360,7 @@ class Read(google.protobuf.message.Message):
UNPARSED_IDENTIFIER_FIELD_NUMBER: builtins.int
unparsed_identifier: builtins.str
+ """(Required) Unparsed identifier for the table."""
def __init__(
self,
*,
@@ -392,27 +395,43 @@ class Read(google.protobuf.message.Message):
SCHEMA_FIELD_NUMBER: builtins.int
OPTIONS_FIELD_NUMBER: builtins.int
format: builtins.str
- """Required. Supported formats include: parquet, orc, text, json, parquet, csv, avro."""
+ """(Required) Supported formats include: parquet, orc, text, json, parquet, csv, avro."""
schema: builtins.str
- """Optional. If not set, Spark will infer the schema."""
+ """(Optional) If not set, Spark will infer the schema."""
@property
def options(
self,
) -> google.protobuf.internal.containers.ScalarMap[builtins.str, builtins.str]:
- """The key is case insensitive."""
+ """Options for the data source. The context of this map varies based on the
+ data source format. This options could be empty for valid data source format.
+ The map key is case insensitive.
+ """
def __init__(
self,
*,
format: builtins.str = ...,
- schema: builtins.str = ...,
+ schema: builtins.str | None = ...,
options: collections.abc.Mapping[builtins.str, builtins.str] | None = ...,
) -> None: ...
+ def HasField(
+ self, field_name: typing_extensions.Literal["_schema", b"_schema", "schema", b"schema"]
+ ) -> builtins.bool: ...
def ClearField(
self,
field_name: typing_extensions.Literal[
- "format", b"format", "options", b"options", "schema", b"schema"
+ "_schema",
+ b"_schema",
+ "format",
+ b"format",
+ "options",
+ b"options",
+ "schema",
+ b"schema",
],
) -> None: ...
+ def WhichOneof(
+ self, oneof_group: typing_extensions.Literal["_schema", b"_schema"]
+ ) -> typing_extensions.Literal["schema"] | None: ...
NAMED_TABLE_FIELD_NUMBER: builtins.int
DATA_SOURCE_FIELD_NUMBER: builtins.int
@@ -466,7 +485,8 @@ class Project(google.protobuf.message.Message):
self,
) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[
pyspark.sql.connect.proto.expressions_pb2.Expression
- ]: ...
+ ]:
+ """(Required) A Project requires at least one expression."""
def __init__(
self,
*,
@@ -494,9 +514,11 @@ class Filter(google.protobuf.message.Message):
INPUT_FIELD_NUMBER: builtins.int
CONDITION_FIELD_NUMBER: builtins.int
@property
- def input(self) -> global___Relation: ...
+ def input(self) -> global___Relation:
+ """(Required) Input relation for a Filter."""
@property
- def condition(self) -> pyspark.sql.connect.proto.expressions_pb2.Expression: ...
+ def condition(self) -> pyspark.sql.connect.proto.expressions_pb2.Expression:
+ """(Required) A Filter must have a condition expression."""
def __init__(
self,
*,
@@ -552,12 +574,19 @@ class Join(google.protobuf.message.Message):
JOIN_TYPE_FIELD_NUMBER: builtins.int
USING_COLUMNS_FIELD_NUMBER: builtins.int
@property
- def left(self) -> global___Relation: ...
+ def left(self) -> global___Relation:
+ """(Required) Left input relation for a Join."""
@property
- def right(self) -> global___Relation: ...
+ def right(self) -> global___Relation:
+ """(Required) Right input relation for a Join."""
@property
- def join_condition(self) -> pyspark.sql.connect.proto.expressions_pb2.Expression: ...
+ def join_condition(self) -> pyspark.sql.connect.proto.expressions_pb2.Expression:
+ """(Optional) The join condition. Could be unset when `using_columns` is utilized.
+
+ This field does not co-exist with using_columns.
+ """
join_type: global___Join.JoinType.ValueType
+ """(Required) The join type."""
@property
def using_columns(
self,
@@ -634,30 +663,57 @@ class SetOperation(google.protobuf.message.Message):
IS_ALL_FIELD_NUMBER: builtins.int
BY_NAME_FIELD_NUMBER: builtins.int
@property
- def left_input(self) -> global___Relation: ...
+ def left_input(self) -> global___Relation:
+ """(Required) Left input relation for a Set operation."""
@property
- def right_input(self) -> global___Relation: ...
+ def right_input(self) -> global___Relation:
+ """(Required) Right input relation for a Set operation."""
set_op_type: global___SetOperation.SetOpType.ValueType
+ """(Required) The Set operation type."""
is_all: builtins.bool
+ """(Optional) If to remove duplicate rows.
+
+ True to preserve all results.
+ False to remove duplicate rows.
+ """
by_name: builtins.bool
+ """(Optional) If to perform the Set operation based on name resolution.
+
+ Only UNION supports this option.
+ """
def __init__(
self,
*,
left_input: global___Relation | None = ...,
right_input: global___Relation | None = ...,
set_op_type: global___SetOperation.SetOpType.ValueType = ...,
- is_all: builtins.bool = ...,
- by_name: builtins.bool = ...,
+ is_all: builtins.bool | None = ...,
+ by_name: builtins.bool | None = ...,
) -> None: ...
def HasField(
self,
field_name: typing_extensions.Literal[
- "left_input", b"left_input", "right_input", b"right_input"
+ "_by_name",
+ b"_by_name",
+ "_is_all",
+ b"_is_all",
+ "by_name",
+ b"by_name",
+ "is_all",
+ b"is_all",
+ "left_input",
+ b"left_input",
+ "right_input",
+ b"right_input",
],
) -> builtins.bool: ...
def ClearField(
self,
field_name: typing_extensions.Literal[
+ "_by_name",
+ b"_by_name",
+ "_is_all",
+ b"_is_all",
"by_name",
b"by_name",
"is_all",
@@ -670,6 +726,14 @@ class SetOperation(google.protobuf.message.Message):
b"set_op_type",
],
) -> None: ...
+ @typing.overload
+ def WhichOneof(
+ self, oneof_group: typing_extensions.Literal["_by_name", b"_by_name"]
+ ) -> typing_extensions.Literal["by_name"] | None: ...
+ @typing.overload
+ def WhichOneof(
+ self, oneof_group: typing_extensions.Literal["_is_all", b"_is_all"]
+ ) -> typing_extensions.Literal["is_all"] | None: ...
global___SetOperation = SetOperation
@@ -681,8 +745,10 @@ class Limit(google.protobuf.message.Message):
INPUT_FIELD_NUMBER: builtins.int
LIMIT_FIELD_NUMBER: builtins.int
@property
- def input(self) -> global___Relation: ...
+ def input(self) -> global___Relation:
+ """(Required) Input relation for a Limit."""
limit: builtins.int
+ """(Required) the limit."""
def __init__(
self,
*,
@@ -708,8 +774,10 @@ class Offset(google.protobuf.message.Message):
INPUT_FIELD_NUMBER: builtins.int
OFFSET_FIELD_NUMBER: builtins.int
@property
- def input(self) -> global___Relation: ...
+ def input(self) -> global___Relation:
+ """(Required) Input relation for an Offset."""
offset: builtins.int
+ """(Required) the limit."""
def __init__(
self,
*,
@@ -734,7 +802,8 @@ class Aggregate(google.protobuf.message.Message):
GROUPING_EXPRESSIONS_FIELD_NUMBER: builtins.int
RESULT_EXPRESSIONS_FIELD_NUMBER: builtins.int
@property
- def input(self) -> global___Relation: ...
+ def input(self) -> global___Relation:
+ """(Required) Input relation for a Aggregate."""
@property
def grouping_expressions(
self,
@@ -849,30 +918,46 @@ class Sort(google.protobuf.message.Message):
SORT_FIELDS_FIELD_NUMBER: builtins.int
IS_GLOBAL_FIELD_NUMBER: builtins.int
@property
- def input(self) -> global___Relation: ...
+ def input(self) -> global___Relation:
+ """(Required) Input relation for a Sort."""
@property
def sort_fields(
self,
) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[
global___Sort.SortField
- ]: ...
+ ]:
+ """(Required) Sort fields."""
is_global: builtins.bool
+ """(Optional) if this is a global sort."""
def __init__(
self,
*,
input: global___Relation | None = ...,
sort_fields: collections.abc.Iterable[global___Sort.SortField] | None = ...,
- is_global: builtins.bool = ...,
+ is_global: builtins.bool | None = ...,
) -> None: ...
def HasField(
- self, field_name: typing_extensions.Literal["input", b"input"]
+ self,
+ field_name: typing_extensions.Literal[
+ "_is_global", b"_is_global", "input", b"input", "is_global", b"is_global"
+ ],
) -> builtins.bool: ...
def ClearField(
self,
field_name: typing_extensions.Literal[
- "input", b"input", "is_global", b"is_global", "sort_fields", b"sort_fields"
+ "_is_global",
+ b"_is_global",
+ "input",
+ b"input",
+ "is_global",
+ b"is_global",
+ "sort_fields",
+ b"sort_fields",
],
) -> None: ...
+ def WhichOneof(
+ self, oneof_group: typing_extensions.Literal["_is_global", b"_is_global"]
+ ) -> typing_extensions.Literal["is_global"] | None: ...
global___Sort = Sort
@@ -887,25 +972,44 @@ class Deduplicate(google.protobuf.message.Message):
COLUMN_NAMES_FIELD_NUMBER: builtins.int
ALL_COLUMNS_AS_KEYS_FIELD_NUMBER: builtins.int
@property
- def input(self) -> global___Relation: ...
+ def input(self) -> global___Relation:
+ """(Required) Input relation for a Deduplicate."""
@property
def column_names(
self,
- ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ...
+ ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
+ """(Optional) Deduplicate based on a list of column names.
+
+ This field cannot be used together with `all_columns_as_keys`.
+ """
all_columns_as_keys: builtins.bool
+ """(Optional) Deduplicate based on all the columns of the input relation.
+
+ This field cannot be used together with `column_names`.
+ """
def __init__(
self,
*,
input: global___Relation | None = ...,
column_names: collections.abc.Iterable[builtins.str] | None = ...,
- all_columns_as_keys: builtins.bool = ...,
+ all_columns_as_keys: builtins.bool | None = ...,
) -> None: ...
def HasField(
- self, field_name: typing_extensions.Literal["input", b"input"]
+ self,
+ field_name: typing_extensions.Literal[
+ "_all_columns_as_keys",
+ b"_all_columns_as_keys",
+ "all_columns_as_keys",
+ b"all_columns_as_keys",
+ "input",
+ b"input",
+ ],
) -> builtins.bool: ...
def ClearField(
self,
field_name: typing_extensions.Literal[
+ "_all_columns_as_keys",
+ b"_all_columns_as_keys",
"all_columns_as_keys",
b"all_columns_as_keys",
"column_names",
@@ -914,10 +1018,16 @@ class Deduplicate(google.protobuf.message.Message):
b"input",
],
) -> None: ...
+ def WhichOneof(
+ self,
+ oneof_group: typing_extensions.Literal["_all_columns_as_keys", b"_all_columns_as_keys"],
+ ) -> typing_extensions.Literal["all_columns_as_keys"] | None: ...
global___Deduplicate = Deduplicate
class LocalRelation(google.protobuf.message.Message):
+ """A relation that does not need to be qualified by name."""
+
DESCRIPTOR: google.protobuf.descriptor.Descriptor
ATTRIBUTES_FIELD_NUMBER: builtins.int
@@ -927,7 +1037,9 @@ class LocalRelation(google.protobuf.message.Message):
) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[
pyspark.sql.connect.proto.expressions_pb2.Expression.QualifiedAttribute
]:
- """TODO: support local data."""
+ """(Optional) A list qualified attributes.
+ TODO: support local data.
+ """
def __init__(
self,
*,
@@ -953,24 +1065,38 @@ class Sample(google.protobuf.message.Message):
WITH_REPLACEMENT_FIELD_NUMBER: builtins.int
SEED_FIELD_NUMBER: builtins.int
@property
- def input(self) -> global___Relation: ...
+ def input(self) -> global___Relation:
+ """(Required) Input relation for a Sample."""
lower_bound: builtins.float
+ """(Required) lower bound."""
upper_bound: builtins.float
+ """(Required) upper bound."""
with_replacement: builtins.bool
+ """(Optional) Whether to sample with replacement."""
seed: builtins.int
+ """(Optional) The random seed."""
def __init__(
self,
*,
input: global___Relation | None = ...,
lower_bound: builtins.float = ...,
upper_bound: builtins.float = ...,
- with_replacement: builtins.bool = ...,
+ with_replacement: builtins.bool | None = ...,
seed: builtins.int | None = ...,
) -> None: ...
def HasField(
self,
field_name: typing_extensions.Literal[
- "_seed", b"_seed", "input", b"input", "seed", b"seed"
+ "_seed",
+ b"_seed",
+ "_with_replacement",
+ b"_with_replacement",
+ "input",
+ b"input",
+ "seed",
+ b"seed",
+ "with_replacement",
+ b"with_replacement",
],
) -> builtins.bool: ...
def ClearField(
@@ -978,6 +1104,8 @@ class Sample(google.protobuf.message.Message):
field_name: typing_extensions.Literal[
"_seed",
b"_seed",
+ "_with_replacement",
+ b"_with_replacement",
"input",
b"input",
"lower_bound",
@@ -990,9 +1118,14 @@ class Sample(google.protobuf.message.Message):
b"with_replacement",
],
) -> None: ...
+ @typing.overload
def WhichOneof(
self, oneof_group: typing_extensions.Literal["_seed", b"_seed"]
) -> typing_extensions.Literal["seed"] | None: ...
+ @typing.overload
+ def WhichOneof(
+ self, oneof_group: typing_extensions.Literal["_with_replacement", b"_with_replacement"]
+ ) -> typing_extensions.Literal["with_replacement"] | None: ...
global___Sample = Sample
@@ -1006,11 +1139,11 @@ class Range(google.protobuf.message.Message):
STEP_FIELD_NUMBER: builtins.int
NUM_PARTITIONS_FIELD_NUMBER: builtins.int
start: builtins.int
- """Optional. Default value = 0"""
+ """(Optional) Default value = 0"""
end: builtins.int
- """Required."""
+ """(Required)"""
step: builtins.int
- """Required."""
+ """(Required)"""
num_partitions: builtins.int
"""Optional. Default value is assigned by 1) SQL conf "spark.sql.leafNodeDefaultParallelism" if
it is set, or 2) spark default parallelism.
@@ -1018,7 +1151,7 @@ class Range(google.protobuf.message.Message):
def __init__(
self,
*,
- start: builtins.int = ...,
+ start: builtins.int | None = ...,
end: builtins.int = ...,
step: builtins.int = ...,
num_partitions: builtins.int | None = ...,
@@ -1026,7 +1159,14 @@ class Range(google.protobuf.message.Message):
def HasField(
self,
field_name: typing_extensions.Literal[
- "_num_partitions", b"_num_partitions", "num_partitions", b"num_partitions"
+ "_num_partitions",
+ b"_num_partitions",
+ "_start",
+ b"_start",
+ "num_partitions",
+ b"num_partitions",
+ "start",
+ b"start",
],
) -> builtins.bool: ...
def ClearField(
@@ -1034,6 +1174,8 @@ class Range(google.protobuf.message.Message):
field_name: typing_extensions.Literal[
"_num_partitions",
b"_num_partitions",
+ "_start",
+ b"_start",
"end",
b"end",
"num_partitions",
@@ -1044,9 +1186,14 @@ class Range(google.protobuf.message.Message):
b"step",
],
) -> None: ...
+ @typing.overload
def WhichOneof(
self, oneof_group: typing_extensions.Literal["_num_partitions", b"_num_partitions"]
) -> typing_extensions.Literal["num_partitions"] | None: ...
+ @typing.overload
+ def WhichOneof(
+ self, oneof_group: typing_extensions.Literal["_start", b"_start"]
+ ) -> typing_extensions.Literal["start"] | None: ...
global___Range = Range
@@ -1060,14 +1207,14 @@ class SubqueryAlias(google.protobuf.message.Message):
QUALIFIER_FIELD_NUMBER: builtins.int
@property
def input(self) -> global___Relation:
- """Required. The input relation."""
+ """(Required) The input relation of SubqueryAlias."""
alias: builtins.str
- """Required. The alias."""
+ """(Required) The alias."""
@property
def qualifier(
self,
) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
- """Optional. Qualifier of the alias."""
+ """(Optional) Qualifier of the alias."""
def __init__(
self,
*,
@@ -1097,27 +1244,40 @@ class Repartition(google.protobuf.message.Message):
SHUFFLE_FIELD_NUMBER: builtins.int
@property
def input(self) -> global___Relation:
- """Required. The input relation."""
+ """(Required) The input relation of Repartition."""
num_partitions: builtins.int
- """Required. Must be positive."""
+ """(Required) Must be positive."""
shuffle: builtins.bool
- """Optional. Default value is false."""
+ """(Optional) Default value is false."""
def __init__(
self,
*,
input: global___Relation | None = ...,
num_partitions: builtins.int = ...,
- shuffle: builtins.bool = ...,
+ shuffle: builtins.bool | None = ...,
) -> None: ...
def HasField(
- self, field_name: typing_extensions.Literal["input", b"input"]
+ self,
+ field_name: typing_extensions.Literal[
+ "_shuffle", b"_shuffle", "input", b"input", "shuffle", b"shuffle"
+ ],
) -> builtins.bool: ...
def ClearField(
self,
field_name: typing_extensions.Literal[
- "input", b"input", "num_partitions", b"num_partitions", "shuffle", b"shuffle"
+ "_shuffle",
+ b"_shuffle",
+ "input",
+ b"input",
+ "num_partitions",
+ b"num_partitions",
+ "shuffle",
+ b"shuffle",
],
) -> None: ...
+ def WhichOneof(
+ self, oneof_group: typing_extensions.Literal["_shuffle", b"_shuffle"]
+ ) -> typing_extensions.Literal["shuffle"] | None: ...
global___Repartition = Repartition
@@ -1147,38 +1307,16 @@ class ShowString(google.protobuf.message.Message):
self,
*,
input: global___Relation | None = ...,
- numRows: builtins.int | None = ...,
- truncate: builtins.int | None = ...,
- vertical: builtins.bool | None = ...,
+ numRows: builtins.int = ...,
+ truncate: builtins.int = ...,
+ vertical: builtins.bool = ...,
) -> None: ...
def HasField(
- self,
- field_name: typing_extensions.Literal[
- "_numRows",
- b"_numRows",
- "_truncate",
- b"_truncate",
- "_vertical",
- b"_vertical",
- "input",
- b"input",
- "numRows",
- b"numRows",
- "truncate",
- b"truncate",
- "vertical",
- b"vertical",
- ],
+ self, field_name: typing_extensions.Literal["input", b"input"]
) -> builtins.bool: ...
def ClearField(
self,
field_name: typing_extensions.Literal[
- "_numRows",
- b"_numRows",
- "_truncate",
- b"_truncate",
- "_vertical",
- b"_vertical",
"input",
b"input",
"numRows",
@@ -1189,18 +1327,6 @@ class ShowString(google.protobuf.message.Message):
b"vertical",
],
) -> None: ...
- @typing.overload
- def WhichOneof(
- self, oneof_group: typing_extensions.Literal["_numRows", b"_numRows"]
- ) -> typing_extensions.Literal["numRows"] | None: ...
- @typing.overload
- def WhichOneof(
- self, oneof_group: typing_extensions.Literal["_truncate", b"_truncate"]
- ) -> typing_extensions.Literal["truncate"] | None: ...
- @typing.overload
- def WhichOneof(
- self, oneof_group: typing_extensions.Literal["_vertical", b"_vertical"]
- ) -> typing_extensions.Literal["vertical"] | None: ...
global___ShowString = ShowString
@@ -1359,12 +1485,12 @@ class RenameColumnsBySameLengthNames(google.protobuf.message.Message):
COLUMN_NAMES_FIELD_NUMBER: builtins.int
@property
def input(self) -> global___Relation:
- """Required. The input relation."""
+ """(Required) The input relation of RenameColumnsBySameLengthNames."""
@property
def column_names(
self,
) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]:
- """Required.
+ """(Required)
The number of columns of the input relation must be equal to the length
of this field. If this is not true, an exception will be returned.
@@ -1411,12 +1537,12 @@ class RenameColumnsByNameToNameMap(google.protobuf.message.Message):
RENAME_COLUMNS_MAP_FIELD_NUMBER: builtins.int
@property
def input(self) -> global___Relation:
- """Required. The input relation."""
+ """(Required) The input relation."""
@property
def rename_columns_map(
self,
) -> google.protobuf.internal.containers.ScalarMap[builtins.str, builtins.str]:
- """Required.
+ """(Required)
Renaming column names of input relation from A to B where A is the map key
and B is the map value. This is a no-op if schema doesn't contain any A. It
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org