You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2023/09/12 05:59:59 UTC
[spark] branch master updated: [SPARK-45124][CONNET] Do not use local user ID for Local Relations
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 47d801e5e9d [SPARK-45124][CONNET] Do not use local user ID for Local Relations
47d801e5e9d is described below
commit 47d801e5e9ded3fb50d274a720ee7874e0b37cc3
Author: Hyukjin Kwon <gu...@apache.org>
AuthorDate: Tue Sep 12 14:59:44 2023 +0900
[SPARK-45124][CONNET] Do not use local user ID for Local Relations
### What changes were proposed in this pull request?
This PR removes the use of `userId` and `sessionId` in `CachedLocalRelation` messages and subsequently make `SparkConnectPlanner` use the `userId`/`sessionId` of the active session rather than the user-provided information.
### Why are the changes needed?
Allowing a fetch of a local relation using user-provided information is a potential security risk since this allows users to fetch arbitrary local relations.
### Does this PR introduce _any_ user-facing change?
Virtually no. It will ignore the session id or user id that users set (but instead use internal ones that users cannot manipulate).
### How was this patch tested?
Manually.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #42880 from HyukjinKwon/no-local-user.
Authored-by: Hyukjin Kwon <gu...@apache.org>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
.../scala/org/apache/spark/sql/SparkSession.scala | 2 -
.../main/protobuf/spark/connect/relations.proto | 10 +-
.../sql/connect/planner/SparkConnectPlanner.scala | 2 +-
python/pyspark/sql/connect/plan.py | 3 -
python/pyspark/sql/connect/proto/relations_pb2.py | 160 ++++++++++-----------
python/pyspark/sql/connect/proto/relations_pb2.pyi | 15 +-
6 files changed, 87 insertions(+), 105 deletions(-)
diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala
index 7882ea64013..7bd8fa59aea 100644
--- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala
+++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -134,8 +134,6 @@ class SparkSession private[sql] (
} else {
val hash = client.cacheLocalRelation(arrowData, encoder.schema.json)
builder.getCachedLocalRelationBuilder
- .setUserId(client.userId)
- .setSessionId(client.sessionId)
.setHash(hash)
}
} else {
diff --git a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
index 8001b3cbcfa..f7f1315ede0 100644
--- a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
+++ b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
@@ -400,11 +400,11 @@ message LocalRelation {
// A local relation that has been cached already.
message CachedLocalRelation {
- // (Required) An identifier of the user which created the local relation
- string userId = 1;
-
- // (Required) An identifier of the Spark SQL session in which the user created the local relation.
- string sessionId = 2;
+ // `userId` and `sessionId` fields are deleted since the server must always use the active
+ // session/user rather than arbitrary values provided by the client. It is never valid to access
+ // a local relation from a different session/user.
+ reserved 1, 2;
+ reserved "userId", "sessionId";
// (Required) A sha-256 hash of the serialized local relation in proto, see LocalRelation.
string hash = 3;
diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
index 1a63c9fc27c..b8ab5539b30 100644
--- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
+++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
@@ -970,7 +970,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging {
private def transformCachedLocalRelation(rel: proto.CachedLocalRelation): LogicalPlan = {
val blockManager = session.sparkContext.env.blockManager
- val blockId = CacheId(rel.getUserId, rel.getSessionId, rel.getHash)
+ val blockId = CacheId(sessionHolder.userId, sessionHolder.sessionId, rel.getHash)
val bytes = blockManager.getLocalBytes(blockId)
bytes
.map { blockData =>
diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py
index 5e9b4e53dbf..f641cb4b2fe 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -398,9 +398,6 @@ class CachedLocalRelation(LogicalPlan):
plan = self._create_proto_relation()
clr = plan.cached_local_relation
- if session._user_id:
- clr.userId = session._user_id
- clr.sessionId = session._session_id
clr.hash = self._hash
return plan
diff --git a/python/pyspark/sql/connect/proto/relations_pb2.py b/python/pyspark/sql/connect/proto/relations_pb2.py
index 3a0a7ff71fd..3f7e5794937 100644
--- a/python/pyspark/sql/connect/proto/relations_pb2.py
+++ b/python/pyspark/sql/connect/proto/relations_pb2.py
@@ -35,7 +35,7 @@ from pyspark.sql.connect.proto import catalog_pb2 as spark_dot_connect_dot_catal
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
- b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1fspark/connect/expressions.proto\x1a\x19spark/connect/types.proto\x1a\x1bspark/connect/catalog.proto"\xe1\x18\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66il [...]
+ b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1fspark/connect/expressions.proto\x1a\x19spark/connect/types.proto\x1a\x1bspark/connect/catalog.proto"\xe1\x18\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66il [...]
)
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
@@ -111,85 +111,85 @@ if _descriptor._USE_C_DESCRIPTORS == False:
_LOCALRELATION._serialized_start = 7090
_LOCALRELATION._serialized_end = 7179
_CACHEDLOCALRELATION._serialized_start = 7181
- _CACHEDLOCALRELATION._serialized_end = 7276
- _CACHEDREMOTERELATION._serialized_start = 7278
- _CACHEDREMOTERELATION._serialized_end = 7333
- _SAMPLE._serialized_start = 7336
- _SAMPLE._serialized_end = 7609
- _RANGE._serialized_start = 7612
- _RANGE._serialized_end = 7757
- _SUBQUERYALIAS._serialized_start = 7759
- _SUBQUERYALIAS._serialized_end = 7873
- _REPARTITION._serialized_start = 7876
- _REPARTITION._serialized_end = 8018
- _SHOWSTRING._serialized_start = 8021
- _SHOWSTRING._serialized_end = 8163
- _HTMLSTRING._serialized_start = 8165
- _HTMLSTRING._serialized_end = 8279
- _STATSUMMARY._serialized_start = 8281
- _STATSUMMARY._serialized_end = 8373
- _STATDESCRIBE._serialized_start = 8375
- _STATDESCRIBE._serialized_end = 8456
- _STATCROSSTAB._serialized_start = 8458
- _STATCROSSTAB._serialized_end = 8559
- _STATCOV._serialized_start = 8561
- _STATCOV._serialized_end = 8657
- _STATCORR._serialized_start = 8660
- _STATCORR._serialized_end = 8797
- _STATAPPROXQUANTILE._serialized_start = 8800
- _STATAPPROXQUANTILE._serialized_end = 8964
- _STATFREQITEMS._serialized_start = 8966
- _STATFREQITEMS._serialized_end = 9091
- _STATSAMPLEBY._serialized_start = 9094
- _STATSAMPLEBY._serialized_end = 9403
- _STATSAMPLEBY_FRACTION._serialized_start = 9295
- _STATSAMPLEBY_FRACTION._serialized_end = 9394
- _NAFILL._serialized_start = 9406
- _NAFILL._serialized_end = 9540
- _NADROP._serialized_start = 9543
- _NADROP._serialized_end = 9677
- _NAREPLACE._serialized_start = 9680
- _NAREPLACE._serialized_end = 9976
- _NAREPLACE_REPLACEMENT._serialized_start = 9835
- _NAREPLACE_REPLACEMENT._serialized_end = 9976
- _TODF._serialized_start = 9978
- _TODF._serialized_end = 10066
- _WITHCOLUMNSRENAMED._serialized_start = 10069
- _WITHCOLUMNSRENAMED._serialized_end = 10308
- _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_start = 10241
- _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_end = 10308
- _WITHCOLUMNS._serialized_start = 10310
- _WITHCOLUMNS._serialized_end = 10429
- _WITHWATERMARK._serialized_start = 10432
- _WITHWATERMARK._serialized_end = 10566
- _HINT._serialized_start = 10569
- _HINT._serialized_end = 10701
- _UNPIVOT._serialized_start = 10704
- _UNPIVOT._serialized_end = 11031
- _UNPIVOT_VALUES._serialized_start = 10961
- _UNPIVOT_VALUES._serialized_end = 11020
- _TOSCHEMA._serialized_start = 11033
- _TOSCHEMA._serialized_end = 11139
- _REPARTITIONBYEXPRESSION._serialized_start = 11142
- _REPARTITIONBYEXPRESSION._serialized_end = 11345
- _MAPPARTITIONS._serialized_start = 11348
- _MAPPARTITIONS._serialized_end = 11529
- _GROUPMAP._serialized_start = 11532
- _GROUPMAP._serialized_end = 12167
- _COGROUPMAP._serialized_start = 12170
- _COGROUPMAP._serialized_end = 12696
- _APPLYINPANDASWITHSTATE._serialized_start = 12699
- _APPLYINPANDASWITHSTATE._serialized_end = 13056
- _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_start = 13059
- _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_end = 13303
- _PYTHONUDTF._serialized_start = 13306
- _PYTHONUDTF._serialized_end = 13483
- _COLLECTMETRICS._serialized_start = 13486
- _COLLECTMETRICS._serialized_end = 13622
- _PARSE._serialized_start = 13625
- _PARSE._serialized_end = 14013
+ _CACHEDLOCALRELATION._serialized_end = 7253
+ _CACHEDREMOTERELATION._serialized_start = 7255
+ _CACHEDREMOTERELATION._serialized_end = 7310
+ _SAMPLE._serialized_start = 7313
+ _SAMPLE._serialized_end = 7586
+ _RANGE._serialized_start = 7589
+ _RANGE._serialized_end = 7734
+ _SUBQUERYALIAS._serialized_start = 7736
+ _SUBQUERYALIAS._serialized_end = 7850
+ _REPARTITION._serialized_start = 7853
+ _REPARTITION._serialized_end = 7995
+ _SHOWSTRING._serialized_start = 7998
+ _SHOWSTRING._serialized_end = 8140
+ _HTMLSTRING._serialized_start = 8142
+ _HTMLSTRING._serialized_end = 8256
+ _STATSUMMARY._serialized_start = 8258
+ _STATSUMMARY._serialized_end = 8350
+ _STATDESCRIBE._serialized_start = 8352
+ _STATDESCRIBE._serialized_end = 8433
+ _STATCROSSTAB._serialized_start = 8435
+ _STATCROSSTAB._serialized_end = 8536
+ _STATCOV._serialized_start = 8538
+ _STATCOV._serialized_end = 8634
+ _STATCORR._serialized_start = 8637
+ _STATCORR._serialized_end = 8774
+ _STATAPPROXQUANTILE._serialized_start = 8777
+ _STATAPPROXQUANTILE._serialized_end = 8941
+ _STATFREQITEMS._serialized_start = 8943
+ _STATFREQITEMS._serialized_end = 9068
+ _STATSAMPLEBY._serialized_start = 9071
+ _STATSAMPLEBY._serialized_end = 9380
+ _STATSAMPLEBY_FRACTION._serialized_start = 9272
+ _STATSAMPLEBY_FRACTION._serialized_end = 9371
+ _NAFILL._serialized_start = 9383
+ _NAFILL._serialized_end = 9517
+ _NADROP._serialized_start = 9520
+ _NADROP._serialized_end = 9654
+ _NAREPLACE._serialized_start = 9657
+ _NAREPLACE._serialized_end = 9953
+ _NAREPLACE_REPLACEMENT._serialized_start = 9812
+ _NAREPLACE_REPLACEMENT._serialized_end = 9953
+ _TODF._serialized_start = 9955
+ _TODF._serialized_end = 10043
+ _WITHCOLUMNSRENAMED._serialized_start = 10046
+ _WITHCOLUMNSRENAMED._serialized_end = 10285
+ _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_start = 10218
+ _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_end = 10285
+ _WITHCOLUMNS._serialized_start = 10287
+ _WITHCOLUMNS._serialized_end = 10406
+ _WITHWATERMARK._serialized_start = 10409
+ _WITHWATERMARK._serialized_end = 10543
+ _HINT._serialized_start = 10546
+ _HINT._serialized_end = 10678
+ _UNPIVOT._serialized_start = 10681
+ _UNPIVOT._serialized_end = 11008
+ _UNPIVOT_VALUES._serialized_start = 10938
+ _UNPIVOT_VALUES._serialized_end = 10997
+ _TOSCHEMA._serialized_start = 11010
+ _TOSCHEMA._serialized_end = 11116
+ _REPARTITIONBYEXPRESSION._serialized_start = 11119
+ _REPARTITIONBYEXPRESSION._serialized_end = 11322
+ _MAPPARTITIONS._serialized_start = 11325
+ _MAPPARTITIONS._serialized_end = 11506
+ _GROUPMAP._serialized_start = 11509
+ _GROUPMAP._serialized_end = 12144
+ _COGROUPMAP._serialized_start = 12147
+ _COGROUPMAP._serialized_end = 12673
+ _APPLYINPANDASWITHSTATE._serialized_start = 12676
+ _APPLYINPANDASWITHSTATE._serialized_end = 13033
+ _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_start = 13036
+ _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_end = 13280
+ _PYTHONUDTF._serialized_start = 13283
+ _PYTHONUDTF._serialized_end = 13460
+ _COLLECTMETRICS._serialized_start = 13463
+ _COLLECTMETRICS._serialized_end = 13599
+ _PARSE._serialized_start = 13602
+ _PARSE._serialized_end = 13990
_PARSE_OPTIONSENTRY._serialized_start = 3987
_PARSE_OPTIONSENTRY._serialized_end = 4045
- _PARSE_PARSEFORMAT._serialized_start = 13914
- _PARSE_PARSEFORMAT._serialized_end = 14002
+ _PARSE_PARSEFORMAT._serialized_start = 13891
+ _PARSE_PARSEFORMAT._serialized_end = 13979
# @@protoc_insertion_point(module_scope)
diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi
index 9cadd4acc52..007b92ef5f4 100644
--- a/python/pyspark/sql/connect/proto/relations_pb2.pyi
+++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi
@@ -1647,28 +1647,15 @@ class CachedLocalRelation(google.protobuf.message.Message):
DESCRIPTOR: google.protobuf.descriptor.Descriptor
- USERID_FIELD_NUMBER: builtins.int
- SESSIONID_FIELD_NUMBER: builtins.int
HASH_FIELD_NUMBER: builtins.int
- userId: builtins.str
- """(Required) An identifier of the user which created the local relation"""
- sessionId: builtins.str
- """(Required) An identifier of the Spark SQL session in which the user created the local relation."""
hash: builtins.str
"""(Required) A sha-256 hash of the serialized local relation in proto, see LocalRelation."""
def __init__(
self,
*,
- userId: builtins.str = ...,
- sessionId: builtins.str = ...,
hash: builtins.str = ...,
) -> None: ...
- def ClearField(
- self,
- field_name: typing_extensions.Literal[
- "hash", b"hash", "sessionId", b"sessionId", "userId", b"userId"
- ],
- ) -> None: ...
+ def ClearField(self, field_name: typing_extensions.Literal["hash", b"hash"]) -> None: ...
global___CachedLocalRelation = CachedLocalRelation
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org