You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by da...@apache.org on 2022/07/05 14:19:03 UTC

[doris] branch master updated: [fix](load) skip empty orc file (#10593)

This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 8e364fb848 [fix](load) skip empty orc file (#10593)
8e364fb848 is described below

commit 8e364fb84828d483b51eac8d9aaca393ee85b7f8
Author: Mingyu Chen <mo...@gmail.com>
AuthorDate: Tue Jul 5 22:18:56 2022 +0800

    [fix](load) skip empty orc file (#10593)
    
    Sometimes an upstream system (e.g., Hive) may create an empty ORC file
    which only has a header and footer, without a schema.
    If we call `_reader->createRowReader()` with selected columns on such a file,
    it will throw ParserError: Invalid column selected xx.
    So here we first check the file's number of rows and skip these kinds of files.
    
    This is only a fix for non-vec load; vec load uses the arrow scanner
    to read ORC files, which does not have this problem.
---
 be/src/exec/orc_scanner.cpp      | 9 +++++++++
 be/src/vec/exec/vorc_scanner.cpp | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/be/src/exec/orc_scanner.cpp b/be/src/exec/orc_scanner.cpp
index 549132cb17..5edf9e75e3 100644
--- a/be/src/exec/orc_scanner.cpp
+++ b/be/src/exec/orc_scanner.cpp
@@ -391,6 +391,15 @@ Status ORCScanner::open_next_reader() {
                 new ORCFileStream(file_reader.release(), range.path));
         _reader = orc::createReader(std::move(inStream), _options);
 
+        // Sometimes an upstream system (e.g., Hive) may create an empty ORC file
+        // which only has a header and footer, without a schema.
+        // If we call `_reader->createRowReader()` with selected columns on such a file,
+        // it will throw ParserError: Invalid column selected xx.
+        // So here we first check the number of rows and skip these kinds of files.
+        if (_reader->getNumberOfRows() == 0) {
+            continue;
+        }
+
         _total_groups = _reader->getNumberOfStripes();
         _current_group = 0;
         _rows_of_group = 0;
diff --git a/be/src/vec/exec/vorc_scanner.cpp b/be/src/vec/exec/vorc_scanner.cpp
index 7521634183..ca5c7c2aef 100644
--- a/be/src/vec/exec/vorc_scanner.cpp
+++ b/be/src/vec/exec/vorc_scanner.cpp
@@ -34,4 +34,4 @@ ArrowReaderWrap* VORCScanner::_new_arrow_reader(FileReader* file_reader, int64_t
     return new ORCReaderWrap(file_reader, batch_size, num_of_columns_from_file);
 }
 
-} // namespace doris::vectorized
\ No newline at end of file
+} // namespace doris::vectorized


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org