You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Tarun Khaneja (Jira)" <ji...@apache.org> on 2019/09/20 06:52:00 UTC
[jira] [Created] (SPARK-29186) SubqueryAlias name value is null in
Spark 2.4 Logical plan.
Tarun Khaneja created SPARK-29186:
-------------------------------------
Summary: SubqueryAlias name value is null in Spark 2.4 Logical plan.
Key: SPARK-29186
URL: https://issues.apache.org/jira/browse/SPARK-29186
Project: Spark
Issue Type: Bug
Components: Spark Core
Affects Versions: 2.4.3
Environment: I have tried this on AWS Glue with Spark 2.4.3
and on windows 10 with 2.4.4
and I am facing the same issue in both environments.
Reporter: Tarun Khaneja
Fix For: 2.2.1
I am writing a program to analyze SQL queries, so I am using the Spark logical plan.
Below is the code which I am using
object QueryAnalyzer { val LOG = LoggerFactory.getLogger(this.getClass)
//Spark Conf val conf = new SparkConf().setMaster("local[2]").setAppName("LocalEdlExecutor")
//Spark Context val sc = new SparkContext(conf)
//sql Context val sqlContext = new SQLContext(sc)
//Spark Session val sparkSession = SparkSession .builder() .appName("Spark User Data") .config("spark.app.name", "LocalEdl") .getOrCreate()
def main(args: Array[String]) \{ var inputDfColumns = Map[String,List[String]]() val dfSession = sparkSession. read. format("csv"). option("header", EdlConstants.TRUE). option("inferschema", EdlConstants.TRUE). option("delimiter", ","). option("decoding", EdlConstants.UTF8). option("multiline", true) var oDF = dfSession. load("C:\\Users\\tarun.khaneja\\data\\order.csv") println("smaple data in oDF====>") oDF.show() var cusDF = dfSession. load("C:\\Users\\tarun.khaneja\\data\\customer.csv") println("smaple data in cusDF====>") cusDF.show() oDF.createOrReplaceTempView("orderTempView") cusDF.createOrReplaceTempView("customerTempView") //get input columns from all dataframe inputDfColumns += ("orderTempView"->oDF.columns.toList) inputDfColumns += ("customerTempView"->cusDF.columns.toList) val res = sqlContext.sql("""select OID, max(MID+CID) as MID_new,ROW_NUMBER() OVER ( ORDER BY CID) as rn from (select OID_1 as OID, CID_1 as CID, OID_1+CID_1 as MID from (select min(ot.OrderID) as OID_1, ct.CustomerID as CID_1 from orderTempView as ot inner join customerTempView as ct on ot.CustomerID = ct.CustomerID group by CID_1)) group by OID,CID""") println(res.show(false)) val analyzedPlan = res.queryExecution.analyzed println(analyzedPlan.prettyJson) }
Now the problem is: with Spark 2.2.1, I get the JSON below, in which SubqueryAlias provides the important alias-name information for each table used in the query, as shown here.
... ... ... [ \{ "class" : "org.apache.spark.sql.catalyst.expressions.AttributeReference", "num-children" : 0, "name" : "OrderDate", "dataType" : "string", "nullable" : true, "metadata" : { }, "exprId" : \{ "product-class" : "org.apache.spark.sql.catalyst.expressions.ExprId", "id" : 2, "jvmId" : "acefe6e6-e469-4c9a-8a36-5694f054dc0a" }, "isGenerated" : false } ] ] }, \{ "class" : "org.apache.spark.sql.catalyst.plans.logical.**SubqueryAlias**", "num-children" : 1, "alias" : "ct", "child" : 0 }, \{ "class" : "org.apache.spark.sql.catalyst.plans.logical.**SubqueryAlias**", "num-children" : 1, "alias" : "customertempview", "child" : 0 }, { "class" : "org.apache.spark.sql.execution.datasources.LogicalRelation", "num-children" : 0, "relation" : null, "output" :
... ... ...
But with Spark 2.4, the SubqueryAlias name comes back as null, as shown in the JSON below.
... ... \{ "class": "org.apache.spark.sql.catalyst.expressions.AttributeReference", "num-children": 0, "name": "CustomerID", "dataType": "integer", "nullable": true, "metadata": {}, "exprId": \{ "product-class": "org.apache.spark.sql.catalyst.expressions.ExprId", "id": 19, "jvmId": "3b0dde0c-0b8f-4c63-a3ed-4dba526f8331" }, "qualifier": "[ct]" }] }, \{ "class": "org.apache.spark.sql.catalyst.plans.logical.**SubqueryAlias**", "num-children": 1, "name": null, "child": 0 }, \{ "class": "org.apache.spark.sql.catalyst.plans.logical.**SubqueryAlias**", "num-children": 1, "name": null, "child": 0 }, { "class": "org.apache.spark.sql.execution.datasources.LogicalRelation", "num-children": 0, "relation": null, "output":
... ...
So, I am not sure whether this is a bug in Spark 2.4 that causes the name to be null in SubqueryAlias. Or, if it is not a bug, how can I obtain the relation between the alias name and the real table name?
Any idea on this?
--
This message was sent by Atlassian Jira
(v8.3.4#803005)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org