Posted to commits@impala.apache.org by bo...@apache.org on 2022/08/30 16:47:45 UTC
[impala] branch master updated: IMPALA-11529: FILE__POSITION virtual column for ORC tables
This is an automated email from the ASF dual-hosted git repository.
boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new fec7a79c5 IMPALA-11529: FILE__POSITION virtual column for ORC tables
fec7a79c5 is described below
commit fec7a79c50de9ef5db2c652ece111aaa303d57c7
Author: Gabor Kaszab <ga...@cloudera.com>
AuthorDate: Mon Aug 22 13:50:46 2022 +0200
IMPALA-11529: FILE__POSITION virtual column for ORC tables
IMPALA-11350 implemented the FILE__POSITION virtual column for Parquet
files. This ticket does the same for ORC files. Note that full ACID
ORC tables already have an implementation of row__id that could
simply be re-used for this ticket.
Testing:
- TestScannersVirtualColumns.test_virtual_column_file_position_generic
is changed to now run on ORC as well. I don't think further testing
is required, as this functionality was already in place for row__id;
we just re-used it for FILE__POSITION.
Change-Id: Ie8e951f73ceb910d64cd149192853a4a2131f79b
Reviewed-on: http://gerrit.cloudera.org:8080/18909
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exec/hdfs-scanner.cc | 3 ++-
be/src/exec/orc/hdfs-orc-scanner.cc | 13 ++++++++++---
be/src/exec/orc/hdfs-orc-scanner.h | 6 ++++--
be/src/exec/orc/orc-column-readers.cc | 10 +++++-----
be/src/exec/orc/orc-column-readers.h | 2 +-
.../virtual-column-file-position-generic.test | 22 +++++++++++-----------
tests/query_test/test_scanners.py | 4 ++--
7 files changed, 35 insertions(+), 25 deletions(-)
diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc
index 1b7e8f47c..7795a8c09 100644
--- a/be/src/exec/hdfs-scanner.cc
+++ b/be/src/exec/hdfs-scanner.cc
@@ -76,7 +76,8 @@ HdfsScanner::~HdfsScanner() {
}
Status HdfsScanner::ValidateSlotDescriptors() const {
- if (file_format() != THdfsFileFormat::PARQUET) {
+ if (file_format() != THdfsFileFormat::PARQUET &&
+ file_format() != THdfsFileFormat::ORC) {
// Virtual column FILE__POSITION is only supported for PARQUET files.
for (SlotDescriptor* sd : scan_node_->virtual_column_slots()) {
if (sd->virtual_column_type() == TVirtualColumnType::FILE_POSITION) {
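The gate above is the only change needed in the format-independent scanner code: after this patch both Parquet and ORC can materialize FILE__POSITION, and any other format still fails validation when the virtual column is requested. A minimal standalone sketch of the same check, with a stand-in FileFormat enum rather than Impala's THdfsFileFormat:

    enum class FileFormat { PARQUET, ORC, TEXT, AVRO };

    // Stand-in for the check in ValidateSlotDescriptors(): only formats
    // whose scanners can fill the FILE__POSITION slot pass validation.
    bool SupportsFilePosition(FileFormat format) {
      return format == FileFormat::PARQUET || format == FileFormat::ORC;
    }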
diff --git a/be/src/exec/orc/hdfs-orc-scanner.cc b/be/src/exec/orc/hdfs-orc-scanner.cc
index ca62de143..ce04bf178 100644
--- a/be/src/exec/orc/hdfs-orc-scanner.cc
+++ b/be/src/exec/orc/hdfs-orc-scanner.cc
@@ -599,7 +599,14 @@ Status HdfsOrcScanner::ResolveColumns(const TupleDescriptor& tuple_desc,
SlotDescriptor* pos_slot_desc = nullptr;
for (SlotDescriptor* slot_desc : tuple_desc.slots()) {
// Skip columns not (necessarily) stored in the data files.
- if (!file_metadata_utils_.NeedDataInFile(slot_desc)) continue;
+ if (!file_metadata_utils_.NeedDataInFile(slot_desc)) {
+ if (slot_desc->virtual_column_type() == TVirtualColumnType::FILE_POSITION) {
+ DCHECK(pos_slot_desc == nullptr)
+ << "There should only be one position slot per tuple";
+ file_position_ = slot_desc;
+ }
+ continue;
+ }
node = nullptr;
pos_field = false;
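The hunk above diverts the FILE__POSITION slot before schema resolution: the virtual column has no backing data in the file, so it is captured into file_position_ and skipped instead of being matched against the ORC type tree. A minimal sketch of the same pattern with stand-in types (SlotDescriptor here is simplified, and FindPositionSlot is illustrative, not Impala's actual API):

    #include <cassert>
    #include <vector>

    enum class VirtualColumnType { NONE, FILE_POSITION };

    struct SlotDescriptor {
      VirtualColumnType virtual_column_type = VirtualColumnType::NONE;
      bool needs_data_in_file = true;  // stand-in for NeedDataInFile()
    };

    // Mirrors the loop in ResolveColumns(): slots without backing file data
    // are checked for FILE_POSITION and then skipped; at most one position
    // slot may exist per tuple.
    const SlotDescriptor* FindPositionSlot(
        const std::vector<SlotDescriptor>& slots) {
      const SlotDescriptor* pos_slot = nullptr;
      for (const SlotDescriptor& slot : slots) {
        if (!slot.needs_data_in_file) {
          if (slot.virtual_column_type == VirtualColumnType::FILE_POSITION) {
            assert(pos_slot == nullptr && "one position slot per tuple");
            pos_slot = &slot;
          }
          continue;
        }
        // ... resolve 'slot' against the ORC type tree here ...
      }
      return pos_slot;
    }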
@@ -676,7 +683,7 @@ void HdfsOrcScanner::SetSyntheticAcidFieldForOriginalFile(const SlotDescriptor*
ValidWriteIdList::GetBucketProperty(filename());
break;
case ACID_FIELD_ROWID_INDEX:
- acid_synthetic_rowid_ = slot_desc;
+ file_position_ = slot_desc;
default:
break;
}
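The rename above works because, for an original (pre-ACID) file in a full ACID table, the synthetic rowid is exactly the row's ordinal position in the file, so a single slot can serve both the ACID rowid and the FILE__POSITION virtual column. A sketch of the derived fields, using an illustrative stand-in struct (SyntheticAcidFields is not an Impala type):

    #include <cstdint>

    // Original files store none of the ACID columns; they are synthesized.
    // The rowid component is the 0-based position of the row in the file,
    // i.e. the same value that FILE__POSITION exposes.
    struct SyntheticAcidFields {
      int64_t write_id;         // derived from the file's location
      int32_t bucket_property;  // derived from the file name
      int64_t rowid;            // equals the row's file position
    };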
@@ -934,7 +941,7 @@ Status HdfsOrcScanner::AssembleRows(RowBatch* row_batch) {
try {
end_of_stripe_ |= !row_reader_->next(*orc_root_batch_);
RETURN_IF_ERROR(orc_root_reader_->UpdateInputBatch(orc_root_batch_.get()));
- if (acid_synthetic_rowid_ != nullptr) {
+ if (file_position_ != nullptr) {
// Set the first row index of the batch. The ORC reader guarantees that rows
// are consecutive in the returned batch.
orc_root_reader_->SetFileRowIndex(row_reader_->getRowNumber());
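Because the ORC reader guarantees that rows within a returned batch are consecutive, a single starting index is enough to derive every row's file position. A hedged sketch of that bookkeeping (FilePositionTracker is illustrative; the actual code wires the value through SetFileRowIndex() on the struct reader):

    #include <cstdint>

    // Tracks the absolute file position of the next row to materialize. The
    // start of each batch comes from the reader's absolute row number (here,
    // row_reader_->getRowNumber()); within the batch a plain increment
    // yields each subsequent position.
    struct FilePositionTracker {
      int64_t next_row_ = 0;
      void StartBatch(int64_t first_row) { next_row_ = first_row; }
      int64_t TakeNext() { return next_row_++; }  // one call per row
    };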
diff --git a/be/src/exec/orc/hdfs-orc-scanner.h b/be/src/exec/orc/hdfs-orc-scanner.h
index 33ce0f875..7383f4942 100644
--- a/be/src/exec/orc/hdfs-orc-scanner.h
+++ b/be/src/exec/orc/hdfs-orc-scanner.h
@@ -271,8 +271,10 @@ class HdfsOrcScanner : public HdfsColumnarScanner {
/// Non-ACID file in full ACID table.
bool acid_original_file_ = false;
- /// Slot descriptor of synthetic rowid of original files.
- const SlotDescriptor* acid_synthetic_rowid_ = nullptr;
+ /// Slot descriptor for the file position (a.k.a. row ID), used for two purposes:
+ /// it can back the synthetic rowid column in original files of a full ACID
+ /// table, or the FILE__POSITION virtual column.
+ const SlotDescriptor* file_position_ = nullptr;
/// True if we need to validate the row batches against the valid write id list. This
/// only needs to be done for Hive Streaming Ingestion. The 'write id' will be the same
diff --git a/be/src/exec/orc/orc-column-readers.cc b/be/src/exec/orc/orc-column-readers.cc
index 5dfbb4f7c..aeb1cbd0c 100644
--- a/be/src/exec/orc/orc-column-readers.cc
+++ b/be/src/exec/orc/orc-column-readers.cc
@@ -537,20 +537,20 @@ Status OrcStructReader::TopLevelReadValueBatch(ScratchTupleBatch* scratch_batch,
NumElements() - row_idx_);
scratch_batch->num_tuples += num_rows_read;
}
- if (scanner_->acid_synthetic_rowid_ != nullptr) {
- FillSyntheticRowId(scratch_batch, scratch_batch_idx, num_rows_read);
+ if (scanner_->file_position_ != nullptr) {
+ FillVirtualRowIdColumn(scratch_batch, scratch_batch_idx, num_rows_read);
}
row_idx_ += num_rows_read;
return Status::OK();
}
-void OrcStructReader::FillSyntheticRowId(ScratchTupleBatch* scratch_batch,
+void OrcStructReader::FillVirtualRowIdColumn(ScratchTupleBatch* scratch_batch,
int scratch_batch_idx, int num_rows) {
- DCHECK(scanner_->acid_synthetic_rowid_ != nullptr);
+ DCHECK(scanner_->file_position_ != nullptr);
int tuple_size = OrcColumnReader::scanner_->tuple_byte_size();
uint8_t* first_tuple = scratch_batch->tuple_mem + scratch_batch_idx * tuple_size;
int64_t* first_slot = reinterpret_cast<Tuple*>(first_tuple)->GetBigIntSlot(
- scanner_->acid_synthetic_rowid_->tuple_offset());
+ scanner_->file_position_->tuple_offset());
StrideWriter<int64_t> out{first_slot, tuple_size};
for (int i = 0; i < num_rows; ++i) {
*out.Advance() = file_row_idx_++;
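FillVirtualRowIdColumn() writes one BIGINT per tuple into scratch memory that is laid out back to back, so the slot for row i sits exactly tuple_size bytes after the slot for row i-1. A minimal sketch of that stride-writing pattern (StrideWriterSketch is illustrative; it only reproduces the Advance-then-write shape visible above):

    #include <cstdint>

    // Writes values of type T spaced a fixed number of bytes apart, matching
    // the layout of one slot inside consecutive fixed-size tuples.
    template <typename T>
    struct StrideWriterSketch {
      T* current;
      int stride_bytes;
      T* Advance() {
        T* slot = current;
        current = reinterpret_cast<T*>(
            reinterpret_cast<uint8_t*>(current) + stride_bytes);
        return slot;
      }
    };

    // Filling num_rows consecutive file positions starting at file_row_idx:
    //   StrideWriterSketch<int64_t> out{first_slot, tuple_size};
    //   for (int i = 0; i < num_rows; ++i) *out.Advance() = file_row_idx++;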
diff --git a/be/src/exec/orc/orc-column-readers.h b/be/src/exec/orc/orc-column-readers.h
index 7dd967a5c..f67e487de 100644
--- a/be/src/exec/orc/orc-column-readers.h
+++ b/be/src/exec/orc/orc-column-readers.h
@@ -630,7 +630,7 @@ class OrcStructReader : public OrcComplexColumnReader {
void SetFileRowIndex(int64_t file_row_idx) { file_row_idx_ = file_row_idx; }
private:
- void FillSyntheticRowId(ScratchTupleBatch* scratch_batch, int scratch_batch_idx,
+ void FillVirtualRowIdColumn(ScratchTupleBatch* scratch_batch, int scratch_batch_idx,
int num_rows);
orc::StructVectorBatch* batch_ = nullptr;
diff --git a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-generic.test b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-generic.test
index e0b13c8c7..bb06a353b 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-generic.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/virtual-column-file-position-generic.test
@@ -1,6 +1,6 @@
====
---- QUERY
-select file__position, * from functional_parquet.alltypestiny order by id;
+select file__position, * from alltypestiny order by id;
---- RESULTS
0,0,true,0,0,0,0,0,0,'01/01/09','0',2009-01-01 00:00:00,2009,1
1,1,false,1,1,1,10,1.100000023841858,10.1,'01/01/09','1',2009-01-01 00:01:00,2009,1
@@ -14,7 +14,7 @@ select file__position, * from functional_parquet.alltypestiny order by id;
BIGINT, INT, BOOLEAN, TINYINT, SMALLINT, INT, BIGINT, FLOAT, DOUBLE, STRING, STRING, TIMESTAMP, INT, INT
====
---- QUERY
-select file__position, * from functional_parquet.alltypestiny
+select file__position, * from alltypestiny
where file__position = 1
order by id;
---- RESULTS
@@ -26,7 +26,7 @@ order by id;
BIGINT, INT, BOOLEAN, TINYINT, SMALLINT, INT, BIGINT, FLOAT, DOUBLE, STRING, STRING, TIMESTAMP, INT, INT
====
---- QUERY
-select file__position, count(*) from functional_parquet.alltypestiny
+select file__position, count(*) from alltypestiny
group by file__position;
---- RESULTS
1,4
@@ -35,7 +35,7 @@ group by file__position;
BIGINT, BIGINT
====
---- QUERY
-select file__position, id from functional_parquet.alltypestiny
+select file__position, id from alltypestiny
order by id;
---- RESULTS
0,0
@@ -50,7 +50,7 @@ order by id;
BIGINT, INT
====
---- QUERY
-select file__position from functional_parquet.alltypestiny;
+select file__position from alltypestiny;
---- RESULTS
0
1
@@ -64,14 +64,14 @@ select file__position from functional_parquet.alltypestiny;
BIGINT
====
---- QUERY
-select max(file__position) from functional_parquet.alltypestiny;
+select max(file__position) from alltypestiny;
---- RESULTS
1
---- TYPES
BIGINT
====
---- QUERY
-select file__position, id from functional_parquet.complextypestbl
+select file__position, id from complextypestbl
order by id;
---- RESULTS
0,1
@@ -86,7 +86,7 @@ order by id;
BIGINT, BIGINT
====
---- QUERY
-select file__position, id, int_array from functional_parquet.complextypestbl;
+select file__position, id, int_array from complextypestbl;
---- RESULTS
0,1,'[1,2,3]'
1,2,'[NULL,1,2,NULL,3,NULL]'
@@ -100,7 +100,7 @@ select file__position, id, int_array from functional_parquet.complextypestbl;
BIGINT, BIGINT, STRING
====
---- QUERY
-select file__position, id, item from functional_parquet.complextypestbl c, c.int_array
+select file__position, id, item from complextypestbl c, c.int_array
order by id;
---- RESULTS
0,1,1
@@ -117,7 +117,7 @@ order by id;
BIGINT, BIGINT, INT
====
---- QUERY
-select file__position, id, item from functional_parquet.complextypestbl c, c.int_array_array;
+select file__position, id, item from complextypestbl c, c.int_array_array;
---- RESULTS
0,1,'[1,2]'
0,1,'[3,4]'
@@ -134,7 +134,7 @@ select file__position, id, item from functional_parquet.complextypestbl c, c.int
BIGINT, BIGINT, STRING
====
---- QUERY
-select file__position, id, i.item from functional_parquet.complextypestbl c, c.int_array_array a, a.item i
+select file__position, id, i.item from complextypestbl c, c.int_array_array a, a.item i
order by id;
---- RESULTS
0,1,1
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 31c927e29..62c0ab3b8 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -150,8 +150,8 @@ class TestScannersVirtualColumns(ImpalaTestSuite):
def test_virtual_column_file_position_generic(self, vector):
# Generic tests about virtual column file position.
file_format = vector.get_value('table_format').file_format
- # TODO: add support for other file formats, especially ORC
- if file_format not in ['parquet']:
+ # TODO: add support for other file formats
+ if file_format not in ['parquet', 'orc']:
pytest.skip()
self.run_test_case('QueryTest/virtual-column-file-position-generic', vector)