You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Sean Owen (JIRA)" <ji...@apache.org> on 2017/03/09 08:33:38 UTC

[jira] [Comment Edited] (SPARK-19875) Map->filter on many columns gets stuck in constraint inference optimization code

    [ https://issues.apache.org/jira/browse/SPARK-19875?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15902668#comment-15902668 ] 

Sean Owen edited comment on SPARK-19875 at 3/9/17 8:32 AM:
-----------------------------------------------------------

It's easier to inline the code in a comment:

{code}
package test.spark

import org.apache.spark.SparkConf
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession

object TestFilter {

  /** Reproduction for SPARK-19875: upper-casing every column and then
    * applying a multi-column OR filter plus a second dependent filter makes
    * the optimizer's constraint-inference step take pathological time on
    * wide (50-column) datasets.
    *
    * Uses an explicit `main` instead of `extends App`: App's delayed
    * initialization is a known pitfall and can distort timing when
    * diagnosing where a job gets stuck.
    */
  def main(args: Array[String]): Unit = {
    // Single local thread is sufficient for the reproduction.
    val conf = new SparkConf().setMaster("local[1]").setAppName("tester")

    val session = SparkSession.builder().config(conf).getOrCreate()
    val sqlContext = session.sqlContext

    // CSV is loaded without a header option, so the header row is data
    // and the columns get default names (_c0.._cN).
    val df = sqlContext.read.format("csv").load("test50cols.csv")

    // Some map operation on all columns.
    val cols = df.columns.map(name => upper(df.col(name)))
    val df2 = df.select(cols: _*)

    // Filter out the header row: keep rows where at least one column
    // differs from its "COLUMNn" header label.
    val filter = df2.columns.indices
      .foldLeft(lit(false))((acc, i) => acc.or(df2.col(df2.columns(i)) =!= s"COLUMN${i + 1}"))
    val df3 = df2.filter(filter)

    // A second, dependent filter — this is what triggers the expensive
    // constraint-inference pass in the optimizer.
    val df4 = df3.filter(df3.col(df3.columns(0)).isNotNull)

    df4.show(100) // stuck here with a 50 column dataset

    session.stop() // release local Spark resources (was leaked before)
  }
}
{code}

What do you mean it gets stuck -- do you have a thread dump?


was (Author: srowen):
It's easier to inline the code in a comment:

{code:scala}
package test.spark

import org.apache.spark.SparkConf
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession

object TestFilter extends App {

  // Single-threaded local master is enough for this reproduction.
  val conf = new SparkConf().setMaster("local[1]").setAppName("tester")

  val session = SparkSession.builder().config(conf).getOrCreate()
  val sc = session.sparkContext
  val sqlContext = session.sqlContext

  val df = sqlContext.read.format("csv").load("test50cols.csv")

  // Apply a per-column transformation (upper-casing) to every column.
  val cols = df.columns.map(name => upper(df.col(name)))
  val df2 = df.select(cols: _*)

  // Build the header-exclusion predicate: OR together one inequality
  // test per column against its "COLUMNn" header label.
  val filter = df2.columns.zipWithIndex
    .map { case (name, i) => df2.col(name) =!= s"COLUMN${i + 1}" }
    .foldLeft(lit(false))(_ or _)
  val df3 = df2.filter(filter)

  // A further filter on the first column.
  val df4 = df3.filter(df3.col(df3.columns(0)).isNotNull)

  df4.show(100) // stuck here with a 50 column dataset

}
{code}

What do you mean it gets stuck -- do you have a thread dump?

> Map->filter on many columns gets stuck in constraint inference optimization code
> --------------------------------------------------------------------------------
>
>                 Key: SPARK-19875
>                 URL: https://issues.apache.org/jira/browse/SPARK-19875
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.1.0
>            Reporter: Jay Pranavamurthi
>         Attachments: test10cols.csv, test50cols.csv, TestFilter.scala
>
>
> The attached code (TestFilter.scala) works with a 10-column csv dataset, but gets stuck with a 50-column csv dataset. Both datasets are attached.



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org