You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@iceberg.apache.org by st...@apache.org on 2022/10/12 19:57:29 UTC
[iceberg] branch master updated: API: Fix estimated row count in ContentScanTask (#5755)
This is an automated email from the ASF dual-hosted git repository.
stevenwu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new f96e9b78c9 API: Fix estimated row count in ContentScanTask (#5755)
f96e9b78c9 is described below
commit f96e9b78c965ba520842772ab1e6845ee4b92357
Author: Wing Yew Poon <wy...@apache.org>
AuthorDate: Wed Oct 12 12:57:22 2022 -0700
API: Fix estimated row count in ContentScanTask (#5755)
---
.../java/org/apache/iceberg/ContentScanTask.java | 3 +-
.../apache/iceberg/spark/source/TestSparkScan.java | 79 ++++++++++++++++++++++
2 files changed, 81 insertions(+), 1 deletion(-)
diff --git a/api/src/main/java/org/apache/iceberg/ContentScanTask.java b/api/src/main/java/org/apache/iceberg/ContentScanTask.java
index 38de80fbd4..7b912ade4b 100644
--- a/api/src/main/java/org/apache/iceberg/ContentScanTask.java
+++ b/api/src/main/java/org/apache/iceberg/ContentScanTask.java
@@ -66,7 +66,8 @@ public interface ContentScanTask<F extends ContentFile<F>> extends ScanTask {
@Override
default long estimatedRowsCount() {
- double scannedFileFraction = ((double) length()) / file().fileSizeInBytes();
+ long splitOffset = (file().splitOffsets() != null) ? file().splitOffsets().get(0) : 0L;
+ double scannedFileFraction = ((double) length()) / (file().fileSizeInBytes() - splitOffset);
return (long) (scannedFileFraction * file().recordCount());
}
}
diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkScan.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkScan.java
new file mode 100644
index 0000000000..905a4e7dfe
--- /dev/null
+++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkScan.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.spark.source;
+
+import static org.apache.spark.sql.functions.date_add;
+import static org.apache.spark.sql.functions.expr;
+
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.spark.SparkTestBaseWithCatalog;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
+import org.apache.spark.sql.connector.read.Statistics;
+import org.apache.spark.sql.util.CaseInsensitiveStringMap;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public class TestSparkScan extends SparkTestBaseWithCatalog {
+
+ private final String format;
+
+ @Parameterized.Parameters(name = "format = {0}")
+ public static Object[] parameters() {
+ return new Object[] {"parquet", "avro", "orc"};
+ }
+
+ public TestSparkScan(String format) {
+ this.format = format;
+ }
+
+ @After
+ public void removeTables() {
+ sql("DROP TABLE IF EXISTS %s", tableName);
+ }
+
+ @Test
+ public void testEstimatedRowCount() throws NoSuchTableException {
+ sql(
+ "CREATE TABLE %s (id BIGINT, date DATE) USING iceberg TBLPROPERTIES('%s' = '%s')",
+ tableName, TableProperties.DEFAULT_FILE_FORMAT, format);
+
+ Dataset<Row> df =
+ spark
+ .range(10000)
+ .withColumn("date", date_add(expr("DATE '1970-01-01'"), expr("CAST(id AS INT)")))
+ .select("id", "date");
+
+ df.coalesce(1).writeTo(tableName).append();
+
+ Table table = validationCatalog.loadTable(tableIdent);
+ SparkScanBuilder scanBuilder =
+ new SparkScanBuilder(spark, table, CaseInsensitiveStringMap.empty());
+ SparkScan scan = (SparkScan) scanBuilder.build();
+ Statistics stats = scan.estimateStatistics();
+
+ Assert.assertEquals(10000L, stats.numRows().getAsLong());
+ }
+}