You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by dz...@apache.org on 2023/04/18 15:26:36 UTC
[drill] 08/15: DRILL-8411: GoogleSheets Reader Will Not Read More than 1K Rows (#2774)
This is an automated email from the ASF dual-hosted git repository.
dzamo pushed a commit to branch 1.21
in repository https://gitbox.apache.org/repos/asf/drill.git
commit fd227154adfb5cd66d7e01f10c4401bbe776e978
Author: Charles S. Givre <cg...@apache.org>
AuthorDate: Wed Mar 15 13:42:38 2023 -0400
DRILL-8411: GoogleSheets Reader Will Not Read More than 1K Rows (#2774)
---
.../java/org/apache/drill/common/Typifier.java | 4 ++--
.../googlesheets/GoogleSheetsBatchReader.java | 23 +++++++++++-----------
.../columns/GoogleSheetsColumnWriter.java | 6 ++++--
.../utils/GoogleSheetsRangeBuilder.java | 9 ++++++++-
.../drill/exec/fn/impl/TestAggregateFunctions.java | 2 +-
5 files changed, 27 insertions(+), 17 deletions(-)
diff --git a/common/src/main/java/org/apache/drill/common/Typifier.java b/common/src/main/java/org/apache/drill/common/Typifier.java
index 1c4b5701f2..5a416fd828 100644
--- a/common/src/main/java/org/apache/drill/common/Typifier.java
+++ b/common/src/main/java/org/apache/drill/common/Typifier.java
@@ -272,7 +272,7 @@ public class Typifier {
* @param date Input date string
* @return LocalDateTime representation of the input String.
*/
- private static LocalDateTime stringAsDateTime(String date) {
+ public static LocalDateTime stringAsDateTime(String date) {
for (DateTimeFormatter format : formats) {
try {
return LocalDateTime.parse(date, format);
@@ -289,7 +289,7 @@ public class Typifier {
* @param date Input date string
* @return LocalDateTime representation of the input String.
*/
- private static LocalDate stringAsDate(String date) {
+ public static LocalDate stringAsDate(String date) {
for (DateTimeFormatter format : dateFormats) {
try {
return LocalDate.parse(date, format);
diff --git a/contrib/storage-googlesheets/src/main/java/org/apache/drill/exec/store/googlesheets/GoogleSheetsBatchReader.java b/contrib/storage-googlesheets/src/main/java/org/apache/drill/exec/store/googlesheets/GoogleSheetsBatchReader.java
index db518fdb6a..90eebe8507 100644
--- a/contrib/storage-googlesheets/src/main/java/org/apache/drill/exec/store/googlesheets/GoogleSheetsBatchReader.java
+++ b/contrib/storage-googlesheets/src/main/java/org/apache/drill/exec/store/googlesheets/GoogleSheetsBatchReader.java
@@ -62,7 +62,7 @@ public class GoogleSheetsBatchReader implements ManagedReader<SchemaNegotiator>
// The default batch size is 1k rows. It appears that Google sets the maximum batch size at 1000
// rows. There is conflicting information about this online, but during testing, ranges with more than
// 1000 rows would throw invalid request errors.
- private static final int BATCH_SIZE = 1000;
+ protected static final int BATCH_SIZE = 1000;
private static final String SHEET_COLUMN_NAME = "_sheets";
private static final String TITLE_COLUMN_NAME = "_title";
@@ -225,12 +225,7 @@ public class GoogleSheetsBatchReader implements ManagedReader<SchemaNegotiator>
@Override
public boolean next() {
logger.debug("Processing batch.");
- while (!rowWriter.isFull()) {
- if (!processRow()) {
- return false;
- }
- }
- return true;
+ return processRow();
}
private boolean processRow() {
@@ -240,12 +235,16 @@ public class GoogleSheetsBatchReader implements ManagedReader<SchemaNegotiator>
// Get next range
String range = rangeBuilder.next();
if (range == null) {
+ rangeBuilder.lastBatch();
return false;
}
data = GoogleSheetsUtils.getDataFromRange(service, sheetID, range);
} else {
List<String> batches = rangeBuilder.nextBatch();
- if (!batches.isEmpty()) {
+ if (batches == null) {
+ rangeBuilder.lastBatch();
+ return false;
+ } else if (!batches.isEmpty()) {
data = GoogleSheetsUtils.getBatchData(service, sheetID, batches);
} else {
data = Collections.emptyList();
@@ -293,12 +292,14 @@ public class GoogleSheetsBatchReader implements ManagedReader<SchemaNegotiator>
rowWriter.save();
}
- // If the results contained less than the batch size, stop iterating.
- if (rowWriter.rowCount() < BATCH_SIZE) {
+ // If there is another batch, return true
+ if (rowWriter.rowCount() + BATCH_SIZE < rangeBuilder.getRowCount()) {
+ return true;
+ } else {
+ // If the results contained less than the batch size, stop iterating.
rangeBuilder.lastBatch();
return false;
}
- return true;
}
private void projectMetadata() {
diff --git a/contrib/storage-googlesheets/src/main/java/org/apache/drill/exec/store/googlesheets/columns/GoogleSheetsColumnWriter.java b/contrib/storage-googlesheets/src/main/java/org/apache/drill/exec/store/googlesheets/columns/GoogleSheetsColumnWriter.java
index 471106c61d..37aa8cdfb6 100644
--- a/contrib/storage-googlesheets/src/main/java/org/apache/drill/exec/store/googlesheets/columns/GoogleSheetsColumnWriter.java
+++ b/contrib/storage-googlesheets/src/main/java/org/apache/drill/exec/store/googlesheets/columns/GoogleSheetsColumnWriter.java
@@ -19,6 +19,7 @@
package org.apache.drill.exec.store.googlesheets.columns;
import org.apache.commons.lang3.StringUtils;
+import org.apache.drill.common.Typifier;
import org.apache.drill.exec.physical.resultSet.RowSetLoader;
import org.apache.drill.exec.vector.accessor.ScalarWriter;
import org.slf4j.Logger;
@@ -27,6 +28,7 @@ import org.slf4j.LoggerFactory;
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalTime;
+import java.time.ZoneOffset;
public abstract class GoogleSheetsColumnWriter {
protected static final Logger logger = LoggerFactory.getLogger(GoogleSheetsColumnWriter.class);
@@ -106,7 +108,7 @@ public abstract class GoogleSheetsColumnWriter {
if (StringUtils.isNotEmpty(stringValue)) {
LocalDate finalValue;
try {
- finalValue = LocalDate.parse(stringValue);
+ finalValue = Typifier.stringAsDate(stringValue);
} catch (NumberFormatException e) {
finalValue = null;
}
@@ -211,7 +213,7 @@ public abstract class GoogleSheetsColumnWriter {
if (StringUtils.isNotEmpty(stringValue)) {
Instant finalValue;
try {
- finalValue = Instant.parse(stringValue);
+ finalValue = Typifier.stringAsDateTime(stringValue).toInstant(ZoneOffset.UTC);
} catch (NumberFormatException e) {
finalValue = null;
}
diff --git a/contrib/storage-googlesheets/src/main/java/org/apache/drill/exec/store/googlesheets/utils/GoogleSheetsRangeBuilder.java b/contrib/storage-googlesheets/src/main/java/org/apache/drill/exec/store/googlesheets/utils/GoogleSheetsRangeBuilder.java
index f5e8417db5..eb03193bad 100644
--- a/contrib/storage-googlesheets/src/main/java/org/apache/drill/exec/store/googlesheets/utils/GoogleSheetsRangeBuilder.java
+++ b/contrib/storage-googlesheets/src/main/java/org/apache/drill/exec/store/googlesheets/utils/GoogleSheetsRangeBuilder.java
@@ -146,7 +146,7 @@ public class GoogleSheetsRangeBuilder implements Iterator<String> {
if (!hasMore) {
return null;
} else if (getStartIndex() > getEndIndex()) {
- hasMore = false;
+ lastBatch();
return null;
}
@@ -197,6 +197,9 @@ public class GoogleSheetsRangeBuilder implements Iterator<String> {
private List<String> buildBatchList() {
if (isStarQuery) {
return null;
+ } else if (getStartIndex() > getEndIndex()) {
+ hasMore = false;
+ return null;
}
List<String> batchList = new ArrayList<>();
@@ -238,6 +241,10 @@ public class GoogleSheetsRangeBuilder implements Iterator<String> {
return buildBatchList();
}
+ public int getRowCount() {
+ return rowCount;
+ }
+
@Override
public String toString() {
return new PlanStringBuilder(this)
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestAggregateFunctions.java b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestAggregateFunctions.java
index edf619074b..804aea35d1 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestAggregateFunctions.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/fn/impl/TestAggregateFunctions.java
@@ -1275,7 +1275,7 @@ public class TestAggregateFunctions extends ClusterTest {
@Test
public void testAggregateWithPivot() throws Exception {
String query = "SELECT * FROM (\n" +
- "SELECT education_level, salary, marital_status, extract(year from age(birth_date)) age\n" +
+ "SELECT education_level, salary, marital_status, extract(year from age('2023-02-23', birth_date)) age\n" +
"FROM cp.`employee.json`)\n" +
"PIVOT (avg(salary) avg_salary, avg(age) avg_age FOR marital_status IN ('M' married, 'S' single))";
testBuilder()