Posted to commits@iceberg.apache.org by st...@apache.org on 2022/10/12 19:57:29 UTC

[iceberg] branch master updated: API: Fix estimated row count in ContentScanTask (#5755)

This is an automated email from the ASF dual-hosted git repository.

stevenwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new f96e9b78c9 API: Fix estimated row count in ContentScanTask (#5755)
f96e9b78c9 is described below

commit f96e9b78c965ba520842772ab1e6845ee4b92357
Author: Wing Yew Poon <wy...@apache.org>
AuthorDate: Wed Oct 12 12:57:22 2022 -0700

    API: Fix estimated row count in ContentScanTask (#5755)
---
 .../java/org/apache/iceberg/ContentScanTask.java   |  3 +-
 .../apache/iceberg/spark/source/TestSparkScan.java | 79 ++++++++++++++++++++++
 2 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/api/src/main/java/org/apache/iceberg/ContentScanTask.java b/api/src/main/java/org/apache/iceberg/ContentScanTask.java
index 38de80fbd4..7b912ade4b 100644
--- a/api/src/main/java/org/apache/iceberg/ContentScanTask.java
+++ b/api/src/main/java/org/apache/iceberg/ContentScanTask.java
@@ -66,7 +66,8 @@ public interface ContentScanTask<F extends ContentFile<F>> extends ScanTask {
 
   @Override
   default long estimatedRowsCount() {
-    double scannedFileFraction = ((double) length()) / file().fileSizeInBytes();
+    long splitOffset = (file().splitOffsets() != null) ? file().splitOffsets().get(0) : 0L;
+    double scannedFileFraction = ((double) length()) / (file().fileSizeInBytes() - splitOffset);
     return (long) (scannedFileFraction * file().recordCount());
   }
 }
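
The idea behind the change, as the diff suggests: a file's split offsets mark where its row groups (Parquet) or blocks begin, so scan tasks cover the region from the first split offset to the end of the file, not the whole file. Dividing a task's length by the full file size therefore makes the per-task fractions sum to less than 1, undercounting rows once a file is split into several tasks. A minimal sketch of the arithmetic, using hypothetical numbers (a 10,000,004-byte file holding 1,000 records, first split offset 4, split into two 5,000,000-byte tasks):

    public class EstimatedRowsDemo {
      public static void main(String[] args) {
        long fileSize = 10_000_004L;    // file().fileSizeInBytes()
        long recordCount = 1_000L;      // file().recordCount()
        long firstSplitOffset = 4L;     // file().splitOffsets().get(0)
        long taskLength = 5_000_000L;   // length() of each of the two tasks

        // Before the fix: the fraction is taken over the whole file, so the
        // two tasks together account for fewer rows than the file holds.
        long before = (long) (((double) taskLength / fileSize) * recordCount);

        // After the fix: the denominator is the data region only, so the
        // per-task estimates sum back to the file's record count.
        long after =
            (long) (((double) taskLength / (fileSize - firstSplitOffset)) * recordCount);

        System.out.println("before the fix: " + before + " rows per task");  // 499
        System.out.println("after the fix:  " + after + " rows per task");   // 500
      }
    }

With the old formula the two estimates summed to 998 of the file's 1,000 records; with the fix they sum to exactly 1,000.
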
diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkScan.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkScan.java
new file mode 100644
index 0000000000..905a4e7dfe
--- /dev/null
+++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkScan.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark.source;
+
+import static org.apache.spark.sql.functions.date_add;
+import static org.apache.spark.sql.functions.expr;
+
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.spark.SparkTestBaseWithCatalog;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
+import org.apache.spark.sql.connector.read.Statistics;
+import org.apache.spark.sql.util.CaseInsensitiveStringMap;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public class TestSparkScan extends SparkTestBaseWithCatalog {
+
+  private final String format;
+
+  @Parameterized.Parameters(name = "format = {0}")
+  public static Object[] parameters() {
+    return new Object[] {"parquet", "avro", "orc"};
+  }
+
+  public TestSparkScan(String format) {
+    this.format = format;
+  }
+
+  @After
+  public void removeTables() {
+    sql("DROP TABLE IF EXISTS %s", tableName);
+  }
+
+  @Test
+  public void testEstimatedRowCount() throws NoSuchTableException {
+    sql(
+        "CREATE TABLE %s (id BIGINT, date DATE) USING iceberg TBLPROPERTIES('%s' = '%s')",
+        tableName, TableProperties.DEFAULT_FILE_FORMAT, format);
+
+    Dataset<Row> df =
+        spark
+            .range(10000)
+            .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id AS INT)")))
+            .select("id", "date");
+
+    df.coalesce(1).writeTo(tableName).append();
+
+    Table table = validationCatalog.loadTable(tableIdent);
+    SparkScanBuilder scanBuilder =
+        new SparkScanBuilder(spark, table, CaseInsensitiveStringMap.empty());
+    SparkScan scan = (SparkScan) scanBuilder.build();
+    Statistics stats = scan.estimateStatistics();
+
+    Assert.assertEquals(10000L, stats.numRows().getAsLong());
+  }
+}
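
A possible follow-up check, not part of this commit: the same property can be asserted against the core API by summing the per-task estimates from a table scan. The snippet below is a hypothetical extension of testEstimatedRowCount and assumes the extra imports java.io.IOException, java.io.UncheckedIOException, org.apache.iceberg.FileScanTask, and org.apache.iceberg.io.CloseableIterable:

    // Hypothetical addition to testEstimatedRowCount: the data was written as
    // a single file, so the per-task estimates should sum to the row count.
    long estimated = 0L;
    try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
      for (FileScanTask task : tasks) {
        estimated += task.estimatedRowsCount();
      }
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
    Assert.assertEquals(10000L, estimated);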