You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Kuldeep (JIRA)" <ji...@apache.org> on 2015/07/31 08:45:05 UTC
[jira] [Updated] (SPARK-9502) ArrayTypes incorrect for DataFrames
Java API
[ https://issues.apache.org/jira/browse/SPARK-9502?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Kuldeep updated SPARK-9502:
---------------------------
Description:
With upgrade to 1.4.1 array types for DataFrames were different in our java applications. I have modified JavaApplySchemaSuite to show the problem. Mainly I have added a list field to the person class.
{code:java}
public static class Person implements Serializable {
private String name;
private int age;
private List<String> skills;
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public int getAge() {
return age;
}
public void setAge(int age) {
this.age = age;
}
public void setSkills(List<String> skills) {
this.skills = skills;
}
public List<String> getSkills() { return skills; }
}
@Test
public void applySchema() {
List<Person> personList = new ArrayList<Person>(2);
List<String> skills = new ArrayList<String>();
skills.add("eating");
skills.add("sleeping");
Person person1 = new Person();
person1.setName("Michael");
person1.setAge(29);
person1.setSkills(skills);
personList.add(person1);
Person person2 = new Person();
person2.setName("Yin");
person2.setAge(28);
person2.setSkills(skills);
personList.add(person2);
JavaRDD<Row> rowRDD = javaCtx.parallelize(personList).map(
new Function<Person, Row>() {
public Row call(Person person) throws Exception {
return RowFactory.create(person.getName(), person.getAge(), person.getSkills());
}
});
List<StructField> fields = new ArrayList<StructField>(2);
fields.add(DataTypes.createStructField("name", DataTypes.StringType, false));
fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
fields.add(DataTypes.createStructField("skills", DataTypes.createArrayType(DataTypes.StringType), false));
StructType schema = DataTypes.createStructType(fields);
DataFrame df = sqlContext.applySchema(rowRDD, schema);
df.registerTempTable("people");
Row[] actual = sqlContext.sql("SELECT * FROM people").collect();
System.out.println(actual[1].get(2).getClass().getName());
System.out.println(actual[1].get(2) instanceof List);
List<Row> expected = new ArrayList<Row>(2);
expected.add(RowFactory.create("Michael", 29, skills));
expected.add(RowFactory.create("Yin", 28, skills));
Assert.assertEquals(expected, Arrays.asList(actual));
}
{code}
This prints
scala.collection.immutable.$colon$colon
false
java.lang.AssertionError:
Expected :[[Michael,29,[eating, sleeping]], [Yin,28,[eating, sleeping]]]
Actual :[[Michael,29,List(eating, sleeping)], [Yin,28,List(eating, sleeping)]]
Not sure if this would be usable even in Scala.
was:
With upgrade to 1.4.1 array types for DataFrames were different in our java applications. I have modified JavaApplySchemaSuite to show the problem.
{code:java}
public static class Person implements Serializable {
private String name;
private int age;
private List<String> skills;
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public int getAge() {
return age;
}
public void setAge(int age) {
this.age = age;
}
public void setSkills(List<String> skills) {
this.skills = skills;
}
public List<String> getSkills() { return skills; }
}
@Test
public void applySchema() {
List<Person> personList = new ArrayList<Person>(2);
List<String> skills = new ArrayList<String>();
skills.add("eating");
skills.add("sleeping");
Person person1 = new Person();
person1.setName("Michael");
person1.setAge(29);
person1.setSkills(skills);
personList.add(person1);
Person person2 = new Person();
person2.setName("Yin");
person2.setAge(28);
person2.setSkills(skills);
personList.add(person2);
JavaRDD<Row> rowRDD = javaCtx.parallelize(personList).map(
new Function<Person, Row>() {
public Row call(Person person) throws Exception {
return RowFactory.create(person.getName(), person.getAge(), person.getSkills());
}
});
List<StructField> fields = new ArrayList<StructField>(2);
fields.add(DataTypes.createStructField("name", DataTypes.StringType, false));
fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
fields.add(DataTypes.createStructField("skills", DataTypes.createArrayType(DataTypes.StringType), false));
StructType schema = DataTypes.createStructType(fields);
DataFrame df = sqlContext.applySchema(rowRDD, schema);
df.registerTempTable("people");
Row[] actual = sqlContext.sql("SELECT * FROM people").collect();
System.out.println(actual[1].get(2).getClass().getName());
System.out.println(actual[1].get(2) instanceof List);
List<Row> expected = new ArrayList<Row>(2);
expected.add(RowFactory.create("Michael", 29, skills));
expected.add(RowFactory.create("Yin", 28, skills));
Assert.assertEquals(expected, Arrays.asList(actual));
}
{code}
This prints
scala.collection.immutable.$colon$colon
false
java.lang.AssertionError:
Expected :[[Michael,29,[eating, sleeping]], [Yin,28,[eating, sleeping]]]
Actual :[[Michael,29,List(eating, sleeping)], [Yin,28,List(eating, sleeping)]]
Not sure if this would be usable even in Scala.
> ArrayTypes incorrect for DataFrames Java API
> --------------------------------------------
>
> Key: SPARK-9502
> URL: https://issues.apache.org/jira/browse/SPARK-9502
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 1.4.1
> Reporter: Kuldeep
> Priority: Critical
>
> With upgrade to 1.4.1 array types for DataFrames were different in our java applications. I have modified JavaApplySchemaSuite to show the problem. Mainly I have added a list field to the person class.
> {code:java}
> public static class Person implements Serializable {
> private String name;
> private int age;
> private List<String> skills;
> public String getName() {
> return name;
> }
> public void setName(String name) {
> this.name = name;
> }
> public int getAge() {
> return age;
> }
> public void setAge(int age) {
> this.age = age;
> }
> public void setSkills(List<String> skills) {
> this.skills = skills;
> }
> public List<String> getSkills() { return skills; }
> }
> @Test
> public void applySchema() {
> List<Person> personList = new ArrayList<Person>(2);
> List<String> skills = new ArrayList<String>();
> skills.add("eating");
> skills.add("sleeping");
> Person person1 = new Person();
> person1.setName("Michael");
> person1.setAge(29);
> person1.setSkills(skills);
> personList.add(person1);
> Person person2 = new Person();
> person2.setName("Yin");
> person2.setAge(28);
> person2.setSkills(skills);
> personList.add(person2);
> JavaRDD<Row> rowRDD = javaCtx.parallelize(personList).map(
> new Function<Person, Row>() {
> public Row call(Person person) throws Exception {
> return RowFactory.create(person.getName(), person.getAge(), person.getSkills());
> }
> });
> List<StructField> fields = new ArrayList<StructField>(2);
> fields.add(DataTypes.createStructField("name", DataTypes.StringType, false));
> fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
> fields.add(DataTypes.createStructField("skills", DataTypes.createArrayType(DataTypes.StringType), false));
> StructType schema = DataTypes.createStructType(fields);
> DataFrame df = sqlContext.applySchema(rowRDD, schema);
> df.registerTempTable("people");
> Row[] actual = sqlContext.sql("SELECT * FROM people").collect();
> System.out.println(actual[1].get(2).getClass().getName());
> System.out.println(actual[1].get(2) instanceof List);
> List<Row> expected = new ArrayList<Row>(2);
> expected.add(RowFactory.create("Michael", 29, skills));
> expected.add(RowFactory.create("Yin", 28, skills));
> Assert.assertEquals(expected, Arrays.asList(actual));
> }
> {code}
> This prints
> scala.collection.immutable.$colon$colon
> false
> java.lang.AssertionError:
> Expected :[[Michael,29,[eating, sleeping]], [Yin,28,[eating, sleeping]]]
> Actual :[[Michael,29,List(eating, sleeping)], [Yin,28,List(eating, sleeping)]]
> Not sure if this would be usable even in Scala.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org