You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Kuldeep (JIRA)" <ji...@apache.org> on 2015/07/31 08:45:05 UTC

[jira] [Updated] (SPARK-9502) ArrayTypes incorrect for DataFrames Java API

     [ https://issues.apache.org/jira/browse/SPARK-9502?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Kuldeep updated SPARK-9502:
---------------------------
    Description: 
With the upgrade to 1.4.1, array types for DataFrames were different in our Java applications. I have modified JavaApplySchemaSuite to show the problem. Mainly I have added a list field to the Person class.

{code:java}
  public static class Person implements Serializable {
    private String name;
    private int age;
    private List<String> skills;

    public String getName() {
      return name;
    }

    public void setName(String name) {
      this.name = name;
    }

    public int getAge() {
      return age;
    }

    public void setAge(int age) {
      this.age = age;
    }

    public void setSkills(List<String> skills) {
      this.skills = skills;
    }

    public List<String> getSkills() { return skills; }
  }

  @Test
  public void applySchema() {
    List<Person> personList = new ArrayList<Person>(2);
    List<String> skills = new ArrayList<String>();
    skills.add("eating");
    skills.add("sleeping");
    Person person1 = new Person();
    person1.setName("Michael");
    person1.setAge(29);
    person1.setSkills(skills);
    personList.add(person1);
    Person person2 = new Person();
    person2.setName("Yin");
    person2.setAge(28);
    person2.setSkills(skills);
    personList.add(person2);

    JavaRDD<Row> rowRDD = javaCtx.parallelize(personList).map(
      new Function<Person, Row>() {
        public Row call(Person person) throws Exception {
          return RowFactory.create(person.getName(), person.getAge(), person.getSkills());
        }
      });

    List<StructField> fields = new ArrayList<StructField>(2);
    fields.add(DataTypes.createStructField("name", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
    fields.add(DataTypes.createStructField("skills", DataTypes.createArrayType(DataTypes.StringType), false));
    StructType schema = DataTypes.createStructType(fields);

    DataFrame df = sqlContext.applySchema(rowRDD, schema);
    df.registerTempTable("people");
    Row[] actual = sqlContext.sql("SELECT * FROM people").collect();

      System.out.println(actual[1].get(2).getClass().getName());
      System.out.println(actual[1].get(2) instanceof List);
    List<Row> expected = new ArrayList<Row>(2);
    expected.add(RowFactory.create("Michael", 29, skills));
    expected.add(RowFactory.create("Yin", 28, skills));

    Assert.assertEquals(expected, Arrays.asList(actual));
  }
{code}

This prints:
scala.collection.immutable.$colon$colon
false

java.lang.AssertionError: 
Expected :[[Michael,29,[eating, sleeping]], [Yin,28,[eating, sleeping]]]
Actual   :[[Michael,29,List(eating, sleeping)], [Yin,28,List(eating, sleeping)]]

Not sure if this would be usable even in Scala.

  was:
With the upgrade to 1.4.1, array types for DataFrames were different in our Java applications. I have modified JavaApplySchemaSuite to show the problem.

{code:java}
  public static class Person implements Serializable {
    private String name;
    private int age;
    private List<String> skills;

    public String getName() {
      return name;
    }

    public void setName(String name) {
      this.name = name;
    }

    public int getAge() {
      return age;
    }

    public void setAge(int age) {
      this.age = age;
    }

    public void setSkills(List<String> skills) {
      this.skills = skills;
    }

    public List<String> getSkills() { return skills; }
  }

  @Test
  public void applySchema() {
    List<Person> personList = new ArrayList<Person>(2);
    List<String> skills = new ArrayList<String>();
    skills.add("eating");
    skills.add("sleeping");
    Person person1 = new Person();
    person1.setName("Michael");
    person1.setAge(29);
    person1.setSkills(skills);
    personList.add(person1);
    Person person2 = new Person();
    person2.setName("Yin");
    person2.setAge(28);
    person2.setSkills(skills);
    personList.add(person2);

    JavaRDD<Row> rowRDD = javaCtx.parallelize(personList).map(
      new Function<Person, Row>() {
        public Row call(Person person) throws Exception {
          return RowFactory.create(person.getName(), person.getAge(), person.getSkills());
        }
      });

    List<StructField> fields = new ArrayList<StructField>(2);
    fields.add(DataTypes.createStructField("name", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
    fields.add(DataTypes.createStructField("skills", DataTypes.createArrayType(DataTypes.StringType), false));
    StructType schema = DataTypes.createStructType(fields);

    DataFrame df = sqlContext.applySchema(rowRDD, schema);
    df.registerTempTable("people");
    Row[] actual = sqlContext.sql("SELECT * FROM people").collect();

      System.out.println(actual[1].get(2).getClass().getName());
      System.out.println(actual[1].get(2) instanceof List);
    List<Row> expected = new ArrayList<Row>(2);
    expected.add(RowFactory.create("Michael", 29, skills));
    expected.add(RowFactory.create("Yin", 28, skills));

    Assert.assertEquals(expected, Arrays.asList(actual));
  }
{code}

This prints:
scala.collection.immutable.$colon$colon
false

java.lang.AssertionError: 
Expected :[[Michael,29,[eating, sleeping]], [Yin,28,[eating, sleeping]]]
Actual   :[[Michael,29,List(eating, sleeping)], [Yin,28,List(eating, sleeping)]]

Not sure if this would be usable even in Scala.


> ArrayTypes incorrect for DataFrames Java API
> --------------------------------------------
>
>                 Key: SPARK-9502
>                 URL: https://issues.apache.org/jira/browse/SPARK-9502
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 1.4.1
>            Reporter: Kuldeep
>            Priority: Critical
>
> With the upgrade to 1.4.1, array types for DataFrames were different in our Java applications. I have modified JavaApplySchemaSuite to show the problem. Mainly I have added a list field to the Person class.
> {code:java}
>   public static class Person implements Serializable {
>     private String name;
>     private int age;
>     private List<String> skills;
>     public String getName() {
>       return name;
>     }
>     public void setName(String name) {
>       this.name = name;
>     }
>     public int getAge() {
>       return age;
>     }
>     public void setAge(int age) {
>       this.age = age;
>     }
>     public void setSkills(List<String> skills) {
>       this.skills = skills;
>     }
>     public List<String> getSkills() { return skills; }
>   }
>   @Test
>   public void applySchema() {
>     List<Person> personList = new ArrayList<Person>(2);
>     List<String> skills = new ArrayList<String>();
>     skills.add("eating");
>     skills.add("sleeping");
>     Person person1 = new Person();
>     person1.setName("Michael");
>     person1.setAge(29);
>     person1.setSkills(skills);
>     personList.add(person1);
>     Person person2 = new Person();
>     person2.setName("Yin");
>     person2.setAge(28);
>     person2.setSkills(skills);
>     personList.add(person2);
>     JavaRDD<Row> rowRDD = javaCtx.parallelize(personList).map(
>       new Function<Person, Row>() {
>         public Row call(Person person) throws Exception {
>           return RowFactory.create(person.getName(), person.getAge(), person.getSkills());
>         }
>       });
>     List<StructField> fields = new ArrayList<StructField>(2);
>     fields.add(DataTypes.createStructField("name", DataTypes.StringType, false));
>     fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
>     fields.add(DataTypes.createStructField("skills", DataTypes.createArrayType(DataTypes.StringType), false));
>     StructType schema = DataTypes.createStructType(fields);
>     DataFrame df = sqlContext.applySchema(rowRDD, schema);
>     df.registerTempTable("people");
>     Row[] actual = sqlContext.sql("SELECT * FROM people").collect();
>       System.out.println(actual[1].get(2).getClass().getName());
>       System.out.println(actual[1].get(2) instanceof List);
>     List<Row> expected = new ArrayList<Row>(2);
>     expected.add(RowFactory.create("Michael", 29, skills));
>     expected.add(RowFactory.create("Yin", 28, skills));
>     Assert.assertEquals(expected, Arrays.asList(actual));
>   }
> {code}
> This prints:
> scala.collection.immutable.$colon$colon
> false
> java.lang.AssertionError: 
> Expected :[[Michael,29,[eating, sleeping]], [Yin,28,[eating, sleeping]]]
> Actual   :[[Michael,29,List(eating, sleeping)], [Yin,28,List(eating, sleeping)]]
> Not sure if this would be usable even in Scala.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org