You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Shawn Zhang (JIRA)" <ji...@apache.org> on 2016/10/19 10:02:58 UTC
[jira] [Closed] (SPARK-18006) When union, spark SQL didn't complain about schema mismatch

     [ https://issues.apache.org/jira/browse/SPARK-18006?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Shawn Zhang closed SPARK-18006.
-------------------------------
    Resolution: Not A Problem

> When union, spark SQL didn't complain about schema mismatch
> -----------------------------------------------------------
>
>                 Key: SPARK-18006
>                 URL: https://issues.apache.org/jira/browse/SPARK-18006
>             Project: Spark
>          Issue Type: Bug
>          Components: Java API
>    Affects Versions: 2.0.1
>            Reporter: Shawn Zhang
>            Priority: Minor
>
> When unioning two Dataset<Row> instances, Spark checks that they have the same number of columns. But if the order of the columns is different, a strange result is generated.
> The output of the following code shows that the columns have been switched by Spark.
> ================= Code =============
> package test;
> import java.util.ArrayList;
> import java.util.List;
> import org.apache.spark.sql.Dataset;
> import org.apache.spark.sql.Row;
> import org.apache.spark.sql.SparkSession;
> import org.apache.spark.sql.types.DataTypes;
> import org.apache.spark.sql.types.Metadata;
> import org.apache.spark.sql.types.StructField;
> import org.apache.spark.sql.types.StructType;
> import audit_spark.SparkConfig;
> public class SchemaBug {
> 	public static class User {
> 		
> 		public User(long uid, long dateline) {
> 			this.uid = uid;
> 			this.dateline = dateline;
> 		}
> 		long uid;
> 		long dateline;
> 		public long getUid() {
> 			return uid;
> 		}
> 		public void setUid(long uid) {
> 			this.uid = uid;
> 		}
> 		public long getDateline() {
> 			return dateline;
> 		}
> 		public void setDateline(long dateline) {
> 			this.dateline = dateline;
> 		}
> 		
> 	}
> 	public static void main(String[] args) {
> 	
> 		SparkSession sparkSession = SparkSession
> 			    .builder()
> 			    .appName("test")
> 				.config("spark.sql.warehouse.dir", "file:///")
> 			    .getOrCreate();
> 		
> 		
> 		StructType userSchema2 = new StructType(new StructField[]{
> 				new StructField("uid", DataTypes.LongType, false, Metadata.empty()),
> 				new StructField("dateline", DataTypes.LongType, false, Metadata.empty()),
> 				
> 				});
> 		
> 		List userList = new ArrayList();
> 		userList.add(new User(1, System.currentTimeMillis()));
> 		userList.add(new User(2, System.currentTimeMillis()));
> 		Dataset<Row> ds1 = SparkConfig.sparkSession.createDataFrame(userList, User.class);
> 		Dataset<Row> ds2 = SparkConfig.sparkSession.createDataFrame(new ArrayList(), userSchema2);
> 		ds2.union(ds1).show();
> 	}
> }
> =========== Program Output ===============
> |          uid|dateline|
> |1476867071496|       1|
> |1476867071496|       2|
> =========== Expected Output ===============
> |     dateline|     uid|
> |1476867071496|       1|
> |1476867071496|       2|



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org