You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2023/09/12 05:59:59 UTC

[spark] branch master updated: [SPARK-45124][CONNET] Do not use local user ID for Local Relations

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 47d801e5e9d [SPARK-45124][CONNET] Do not use local user ID for Local Relations
47d801e5e9d is described below

commit 47d801e5e9ded3fb50d274a720ee7874e0b37cc3
Author: Hyukjin Kwon <gu...@apache.org>
AuthorDate: Tue Sep 12 14:59:44 2023 +0900

    [SPARK-45124][CONNET] Do not use local user ID for Local Relations
    
    ### What changes were proposed in this pull request?
    
    This PR removes the use of `userId` and `sessionId` in `CachedLocalRelation` messages and subsequently make `SparkConnectPlanner` use the `userId`/`sessionId` of the active session rather than the user-provided information.
    
    ### Why are the changes needed?
    
    Allowing a fetch of a local relation using user-provided information is a potential security risk since this allows users to fetch arbitrary local relations.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Virtually no. It will ignore the session id or user id that users set (but instead use internal ones that users cannot manipulate).
    
    ### How was this patch tested?
    
    Manually.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #42880 from HyukjinKwon/no-local-user.
    
    Authored-by: Hyukjin Kwon <gu...@apache.org>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 .../scala/org/apache/spark/sql/SparkSession.scala  |   2 -
 .../main/protobuf/spark/connect/relations.proto    |  10 +-
 .../sql/connect/planner/SparkConnectPlanner.scala  |   2 +-
 python/pyspark/sql/connect/plan.py                 |   3 -
 python/pyspark/sql/connect/proto/relations_pb2.py  | 160 ++++++++++-----------
 python/pyspark/sql/connect/proto/relations_pb2.pyi |  15 +-
 6 files changed, 87 insertions(+), 105 deletions(-)

diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala
index 7882ea64013..7bd8fa59aea 100644
--- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala
+++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -134,8 +134,6 @@ class SparkSession private[sql] (
         } else {
           val hash = client.cacheLocalRelation(arrowData, encoder.schema.json)
           builder.getCachedLocalRelationBuilder
-            .setUserId(client.userId)
-            .setSessionId(client.sessionId)
             .setHash(hash)
         }
       } else {
diff --git a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
index 8001b3cbcfa..f7f1315ede0 100644
--- a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
+++ b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
@@ -400,11 +400,11 @@ message LocalRelation {
 
 // A local relation that has been cached already.
 message CachedLocalRelation {
-  // (Required) An identifier of the user which created the local relation
-  string userId = 1;
-
-  // (Required) An identifier of the Spark SQL session in which the user created the local relation.
-  string sessionId = 2;
+  // `userId` and `sessionId` fields are deleted since the server must always use the active
+  // session/user rather than arbitrary values provided by the client. It is never valid to access
+  // a local relation from a different session/user.
+  reserved 1, 2;
+  reserved "userId", "sessionId";
 
   // (Required) A sha-256 hash of the serialized local relation in proto, see LocalRelation.
   string hash = 3;
diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
index 1a63c9fc27c..b8ab5539b30 100644
--- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
+++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala
@@ -970,7 +970,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging {
 
   private def transformCachedLocalRelation(rel: proto.CachedLocalRelation): LogicalPlan = {
     val blockManager = session.sparkContext.env.blockManager
-    val blockId = CacheId(rel.getUserId, rel.getSessionId, rel.getHash)
+    val blockId = CacheId(sessionHolder.userId, sessionHolder.sessionId, rel.getHash)
     val bytes = blockManager.getLocalBytes(blockId)
     bytes
       .map { blockData =>
diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py
index 5e9b4e53dbf..f641cb4b2fe 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -398,9 +398,6 @@ class CachedLocalRelation(LogicalPlan):
         plan = self._create_proto_relation()
         clr = plan.cached_local_relation
 
-        if session._user_id:
-            clr.userId = session._user_id
-        clr.sessionId = session._session_id
         clr.hash = self._hash
 
         return plan
diff --git a/python/pyspark/sql/connect/proto/relations_pb2.py b/python/pyspark/sql/connect/proto/relations_pb2.py
index 3a0a7ff71fd..3f7e5794937 100644
--- a/python/pyspark/sql/connect/proto/relations_pb2.py
+++ b/python/pyspark/sql/connect/proto/relations_pb2.py
@@ -35,7 +35,7 @@ from pyspark.sql.connect.proto import catalog_pb2 as spark_dot_connect_dot_catal
 
 
 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
-    b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1fspark/connect/expressions.proto\x1a\x19spark/connect/types.proto\x1a\x1bspark/connect/catalog.proto"\xe1\x18\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66il [...]
+    b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1fspark/connect/expressions.proto\x1a\x19spark/connect/types.proto\x1a\x1bspark/connect/catalog.proto"\xe1\x18\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66il [...]
 )
 
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
@@ -111,85 +111,85 @@ if _descriptor._USE_C_DESCRIPTORS == False:
     _LOCALRELATION._serialized_start = 7090
     _LOCALRELATION._serialized_end = 7179
     _CACHEDLOCALRELATION._serialized_start = 7181
-    _CACHEDLOCALRELATION._serialized_end = 7276
-    _CACHEDREMOTERELATION._serialized_start = 7278
-    _CACHEDREMOTERELATION._serialized_end = 7333
-    _SAMPLE._serialized_start = 7336
-    _SAMPLE._serialized_end = 7609
-    _RANGE._serialized_start = 7612
-    _RANGE._serialized_end = 7757
-    _SUBQUERYALIAS._serialized_start = 7759
-    _SUBQUERYALIAS._serialized_end = 7873
-    _REPARTITION._serialized_start = 7876
-    _REPARTITION._serialized_end = 8018
-    _SHOWSTRING._serialized_start = 8021
-    _SHOWSTRING._serialized_end = 8163
-    _HTMLSTRING._serialized_start = 8165
-    _HTMLSTRING._serialized_end = 8279
-    _STATSUMMARY._serialized_start = 8281
-    _STATSUMMARY._serialized_end = 8373
-    _STATDESCRIBE._serialized_start = 8375
-    _STATDESCRIBE._serialized_end = 8456
-    _STATCROSSTAB._serialized_start = 8458
-    _STATCROSSTAB._serialized_end = 8559
-    _STATCOV._serialized_start = 8561
-    _STATCOV._serialized_end = 8657
-    _STATCORR._serialized_start = 8660
-    _STATCORR._serialized_end = 8797
-    _STATAPPROXQUANTILE._serialized_start = 8800
-    _STATAPPROXQUANTILE._serialized_end = 8964
-    _STATFREQITEMS._serialized_start = 8966
-    _STATFREQITEMS._serialized_end = 9091
-    _STATSAMPLEBY._serialized_start = 9094
-    _STATSAMPLEBY._serialized_end = 9403
-    _STATSAMPLEBY_FRACTION._serialized_start = 9295
-    _STATSAMPLEBY_FRACTION._serialized_end = 9394
-    _NAFILL._serialized_start = 9406
-    _NAFILL._serialized_end = 9540
-    _NADROP._serialized_start = 9543
-    _NADROP._serialized_end = 9677
-    _NAREPLACE._serialized_start = 9680
-    _NAREPLACE._serialized_end = 9976
-    _NAREPLACE_REPLACEMENT._serialized_start = 9835
-    _NAREPLACE_REPLACEMENT._serialized_end = 9976
-    _TODF._serialized_start = 9978
-    _TODF._serialized_end = 10066
-    _WITHCOLUMNSRENAMED._serialized_start = 10069
-    _WITHCOLUMNSRENAMED._serialized_end = 10308
-    _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_start = 10241
-    _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_end = 10308
-    _WITHCOLUMNS._serialized_start = 10310
-    _WITHCOLUMNS._serialized_end = 10429
-    _WITHWATERMARK._serialized_start = 10432
-    _WITHWATERMARK._serialized_end = 10566
-    _HINT._serialized_start = 10569
-    _HINT._serialized_end = 10701
-    _UNPIVOT._serialized_start = 10704
-    _UNPIVOT._serialized_end = 11031
-    _UNPIVOT_VALUES._serialized_start = 10961
-    _UNPIVOT_VALUES._serialized_end = 11020
-    _TOSCHEMA._serialized_start = 11033
-    _TOSCHEMA._serialized_end = 11139
-    _REPARTITIONBYEXPRESSION._serialized_start = 11142
-    _REPARTITIONBYEXPRESSION._serialized_end = 11345
-    _MAPPARTITIONS._serialized_start = 11348
-    _MAPPARTITIONS._serialized_end = 11529
-    _GROUPMAP._serialized_start = 11532
-    _GROUPMAP._serialized_end = 12167
-    _COGROUPMAP._serialized_start = 12170
-    _COGROUPMAP._serialized_end = 12696
-    _APPLYINPANDASWITHSTATE._serialized_start = 12699
-    _APPLYINPANDASWITHSTATE._serialized_end = 13056
-    _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_start = 13059
-    _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_end = 13303
-    _PYTHONUDTF._serialized_start = 13306
-    _PYTHONUDTF._serialized_end = 13483
-    _COLLECTMETRICS._serialized_start = 13486
-    _COLLECTMETRICS._serialized_end = 13622
-    _PARSE._serialized_start = 13625
-    _PARSE._serialized_end = 14013
+    _CACHEDLOCALRELATION._serialized_end = 7253
+    _CACHEDREMOTERELATION._serialized_start = 7255
+    _CACHEDREMOTERELATION._serialized_end = 7310
+    _SAMPLE._serialized_start = 7313
+    _SAMPLE._serialized_end = 7586
+    _RANGE._serialized_start = 7589
+    _RANGE._serialized_end = 7734
+    _SUBQUERYALIAS._serialized_start = 7736
+    _SUBQUERYALIAS._serialized_end = 7850
+    _REPARTITION._serialized_start = 7853
+    _REPARTITION._serialized_end = 7995
+    _SHOWSTRING._serialized_start = 7998
+    _SHOWSTRING._serialized_end = 8140
+    _HTMLSTRING._serialized_start = 8142
+    _HTMLSTRING._serialized_end = 8256
+    _STATSUMMARY._serialized_start = 8258
+    _STATSUMMARY._serialized_end = 8350
+    _STATDESCRIBE._serialized_start = 8352
+    _STATDESCRIBE._serialized_end = 8433
+    _STATCROSSTAB._serialized_start = 8435
+    _STATCROSSTAB._serialized_end = 8536
+    _STATCOV._serialized_start = 8538
+    _STATCOV._serialized_end = 8634
+    _STATCORR._serialized_start = 8637
+    _STATCORR._serialized_end = 8774
+    _STATAPPROXQUANTILE._serialized_start = 8777
+    _STATAPPROXQUANTILE._serialized_end = 8941
+    _STATFREQITEMS._serialized_start = 8943
+    _STATFREQITEMS._serialized_end = 9068
+    _STATSAMPLEBY._serialized_start = 9071
+    _STATSAMPLEBY._serialized_end = 9380
+    _STATSAMPLEBY_FRACTION._serialized_start = 9272
+    _STATSAMPLEBY_FRACTION._serialized_end = 9371
+    _NAFILL._serialized_start = 9383
+    _NAFILL._serialized_end = 9517
+    _NADROP._serialized_start = 9520
+    _NADROP._serialized_end = 9654
+    _NAREPLACE._serialized_start = 9657
+    _NAREPLACE._serialized_end = 9953
+    _NAREPLACE_REPLACEMENT._serialized_start = 9812
+    _NAREPLACE_REPLACEMENT._serialized_end = 9953
+    _TODF._serialized_start = 9955
+    _TODF._serialized_end = 10043
+    _WITHCOLUMNSRENAMED._serialized_start = 10046
+    _WITHCOLUMNSRENAMED._serialized_end = 10285
+    _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_start = 10218
+    _WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY._serialized_end = 10285
+    _WITHCOLUMNS._serialized_start = 10287
+    _WITHCOLUMNS._serialized_end = 10406
+    _WITHWATERMARK._serialized_start = 10409
+    _WITHWATERMARK._serialized_end = 10543
+    _HINT._serialized_start = 10546
+    _HINT._serialized_end = 10678
+    _UNPIVOT._serialized_start = 10681
+    _UNPIVOT._serialized_end = 11008
+    _UNPIVOT_VALUES._serialized_start = 10938
+    _UNPIVOT_VALUES._serialized_end = 10997
+    _TOSCHEMA._serialized_start = 11010
+    _TOSCHEMA._serialized_end = 11116
+    _REPARTITIONBYEXPRESSION._serialized_start = 11119
+    _REPARTITIONBYEXPRESSION._serialized_end = 11322
+    _MAPPARTITIONS._serialized_start = 11325
+    _MAPPARTITIONS._serialized_end = 11506
+    _GROUPMAP._serialized_start = 11509
+    _GROUPMAP._serialized_end = 12144
+    _COGROUPMAP._serialized_start = 12147
+    _COGROUPMAP._serialized_end = 12673
+    _APPLYINPANDASWITHSTATE._serialized_start = 12676
+    _APPLYINPANDASWITHSTATE._serialized_end = 13033
+    _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_start = 13036
+    _COMMONINLINEUSERDEFINEDTABLEFUNCTION._serialized_end = 13280
+    _PYTHONUDTF._serialized_start = 13283
+    _PYTHONUDTF._serialized_end = 13460
+    _COLLECTMETRICS._serialized_start = 13463
+    _COLLECTMETRICS._serialized_end = 13599
+    _PARSE._serialized_start = 13602
+    _PARSE._serialized_end = 13990
     _PARSE_OPTIONSENTRY._serialized_start = 3987
     _PARSE_OPTIONSENTRY._serialized_end = 4045
-    _PARSE_PARSEFORMAT._serialized_start = 13914
-    _PARSE_PARSEFORMAT._serialized_end = 14002
+    _PARSE_PARSEFORMAT._serialized_start = 13891
+    _PARSE_PARSEFORMAT._serialized_end = 13979
 # @@protoc_insertion_point(module_scope)
diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi
index 9cadd4acc52..007b92ef5f4 100644
--- a/python/pyspark/sql/connect/proto/relations_pb2.pyi
+++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi
@@ -1647,28 +1647,15 @@ class CachedLocalRelation(google.protobuf.message.Message):
 
     DESCRIPTOR: google.protobuf.descriptor.Descriptor
 
-    USERID_FIELD_NUMBER: builtins.int
-    SESSIONID_FIELD_NUMBER: builtins.int
     HASH_FIELD_NUMBER: builtins.int
-    userId: builtins.str
-    """(Required) An identifier of the user which created the local relation"""
-    sessionId: builtins.str
-    """(Required) An identifier of the Spark SQL session in which the user created the local relation."""
     hash: builtins.str
     """(Required) A sha-256 hash of the serialized local relation in proto, see LocalRelation."""
     def __init__(
         self,
         *,
-        userId: builtins.str = ...,
-        sessionId: builtins.str = ...,
         hash: builtins.str = ...,
     ) -> None: ...
-    def ClearField(
-        self,
-        field_name: typing_extensions.Literal[
-            "hash", b"hash", "sessionId", b"sessionId", "userId", b"userId"
-        ],
-    ) -> None: ...
+    def ClearField(self, field_name: typing_extensions.Literal["hash", b"hash"]) -> None: ...
 
 global___CachedLocalRelation = CachedLocalRelation
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org