You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2021/10/28 14:24:15 UTC

[GitHub] [spark] dohongdayi commented on a change in pull request #34311: [SPARK-37038][SQL][WIP] DSV2 Sample Push Down

dohongdayi commented on a change in pull request #34311:
URL: https://github.com/apache/spark/pull/34311#discussion_r738266697



##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
##########
@@ -104,6 +105,7 @@ case class RowDataSourceScanExec(
     filters: Set[Filter],
     handledFilters: Set[Filter],
     aggregation: Option[Aggregation],
+    sample: Option[TableSample],

Review comment:
       So many pushdown related parameters, would it be better if they were wrapped by some parent case class?

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/connector/expressions/expressions.scala
##########
@@ -357,3 +366,14 @@ private[sql] object SortValue {
       None
   }
 }
+
+private[sql] final case class TableSampleValue(
+    methodName: String,
+    lowerBound: Double,
+    upperBound: Double,
+    withReplacement: Boolean,
+    seed: Long) extends TableSample {
+
+  override def describe(): String = s"$methodName $lowerBound $lowerBound $upperBound" +

Review comment:
       two `lowerBound`s ? 

##########
File path: sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala
##########
@@ -154,4 +155,21 @@ private object PostgresDialect extends JdbcDialect {
     val nullable = if (isNullable) "DROP NOT NULL" else "SET NOT NULL"
     s"ALTER TABLE $tableName ALTER COLUMN ${quoteIdentifier(columnName)} $nullable"
   }
+
+  override def supportsTableSample: Boolean = true
+
+  override def getTableSample(sample: Option[TableSample]): String = {
+    if (sample.nonEmpty) {
+      val method = if (sample.get.methodName.isEmpty) {

Review comment:
       If many of the dialects have default sample methods, would `Option[String]` be better type for `TableSample.methodName`?

##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
##########
@@ -336,6 +336,7 @@ object DataSourceStrategy
         Set.empty,
         Set.empty,
         None,
+        null,

Review comment:
       I think here should be `None`

##########
File path: external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala
##########
@@ -284,4 +285,32 @@ private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFu
       testIndexUsingSQL(s"$catalogName.new_table")
     }
   }
+
+  def supportsTableSample: Boolean = false
+
+  test("Test TABLESAMPLE") {
+    withTable(s"$catalogName.new_table") {
+      sql(s"CREATE TABLE $catalogName.new_table (col1 INT, col2 INT)")
+      sql(s"INSERT INTO TABLE $catalogName.new_table values (1, 2)")
+      sql(s"INSERT INTO TABLE $catalogName.new_table values (3, 4)")
+      sql(s"INSERT INTO TABLE $catalogName.new_table values (5, 6)")
+      sql(s"INSERT INTO TABLE $catalogName.new_table values (7, 8)")
+      sql(s"INSERT INTO TABLE $catalogName.new_table values (9, 10)")
+      sql(s"INSERT INTO TABLE $catalogName.new_table values (11, 12)")
+      sql(s"INSERT INTO TABLE $catalogName.new_table values (13, 14)")
+      sql(s"INSERT INTO TABLE $catalogName.new_table values (15, 16)")
+      sql(s"INSERT INTO TABLE $catalogName.new_table values (17, 18)")
+      sql(s"INSERT INTO TABLE $catalogName.new_table values (19, 20)")
+      if (supportsTableSample) {

Review comment:
       If `supportsTableSample` was false, it would be no need to create testing table or insert testing data at all

##########
File path: sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala
##########
@@ -358,6 +358,10 @@ abstract class JdbcDialect extends Serializable with Logging{
   def classifyException(message: String, e: Throwable): AnalysisException = {
     new AnalysisException(message, cause = Some(e))
   }
+
+  def supportsTableSample: Boolean = false

Review comment:
       Would `supportsTableSample()` need parameter `methodName: Option[String]`, for the dialect might not support the specified sample method or not support any sample method at all ?

##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
##########
@@ -104,6 +105,7 @@ case class RowDataSourceScanExec(
     filters: Set[Filter],
     handledFilters: Set[Filter],
     aggregation: Option[Aggregation],
+    sample: TableSample,

Review comment:
       Would `Option[TableSample]` be better type for sample might not be provided ?

##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala
##########
@@ -225,6 +226,27 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper {
       withProjection
   }
 
+  def applySample(plan: LogicalPlan): LogicalPlan = plan.transform {
+    case sample @ Sample(_, _, _, _, child) => child match {
+        case ScanOperation(_, _, sHolder: ScanBuilderHolder) =>
+          val tableSample = LogicalExpressions.tableSample(
+            "",

Review comment:
       I didn't see any other possible value of `TableSample.methodName` beside `""` here, so I'm not sure that`TableSample.methodName` was important ? 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org