You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues-all@impala.apache.org by "Tim Armstrong (JIRA)" <ji...@apache.org> on 2018/10/30 21:19:00 UTC
[jira] [Updated] (IMPALA-5861) HdfsParquetScanner::GetNextInternal() IsZeroSlotTableScan() case double counts
[ https://issues.apache.org/jira/browse/IMPALA-5861?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Tim Armstrong updated IMPALA-5861:
----------------------------------
Description:
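It appears that this code is double counting into {{rows_read_counter()}}: {{row_group_rows_read_}} is already a running total across calls, so adding it (rather than {{num_to_commit}}) to the counter on every call re-counts the rows from all earlier batches: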
{code:title=HdfsParquetScanner::GetNextInternal()}
} else if (scan_node_->IsZeroSlotTableScan()) {
// There are no materialized slots and we are not optimizing count(*), e.g.
// "select 1 from alltypes". We can serve this query from just the file metadata.
// We don't need to read the column data.
if (row_group_rows_read_ == file_metadata_.num_rows) {
eos_ = true;
return Status::OK();
}
assemble_rows_timer_.Start();
DCHECK_LE(row_group_rows_read_, file_metadata_.num_rows);
int64_t rows_remaining = file_metadata_.num_rows - row_group_rows_read_;
int max_tuples = min<int64_t>(row_batch->capacity(), rows_remaining);
TupleRow* current_row = row_batch->GetRow(row_batch->AddRow());
int num_to_commit = WriteTemplateTuples(current_row, max_tuples);
Status status = CommitRows(row_batch, num_to_commit);
assemble_rows_timer_.Stop();
RETURN_IF_ERROR(status);
row_group_rows_read_ += num_to_commit;
COUNTER_ADD(scan_node_->rows_read_counter(), row_group_rows_read_); <======
return Status::OK();
}
{code}
Repro in impala-shell:
{noformat}
set batch_size=16; set num_nodes=1; select count(*) from functional.alltypesmixedformat; profile
....
- RowsRead: 3.94K (3936)
- RowsReturned: 1.20K (1200)
{noformat}
was:
It appears that this code is double counting into {{rows_read_counter()}}, since {{row_group_rows_read_}} is already accumulating:
{code:title=HdfsParquetScanner::GetNextInternal()}
} else if (scan_node_->IsZeroSlotTableScan()) {
// There are no materialized slots and we are not optimizing count(*), e.g.
// "select 1 from alltypes". We can serve this query from just the file metadata.
// We don't need to read the column data.
if (row_group_rows_read_ == file_metadata_.num_rows) {
eos_ = true;
return Status::OK();
}
assemble_rows_timer_.Start();
DCHECK_LE(row_group_rows_read_, file_metadata_.num_rows);
int64_t rows_remaining = file_metadata_.num_rows - row_group_rows_read_;
int max_tuples = min<int64_t>(row_batch->capacity(), rows_remaining);
TupleRow* current_row = row_batch->GetRow(row_batch->AddRow());
int num_to_commit = WriteTemplateTuples(current_row, max_tuples);
Status status = CommitRows(row_batch, num_to_commit);
assemble_rows_timer_.Stop();
RETURN_IF_ERROR(status);
row_group_rows_read_ += num_to_commit;
COUNTER_ADD(scan_node_->rows_read_counter(), row_group_rows_read_); <======
return Status::OK();
}
{code}
> HdfsParquetScanner::GetNextInternal() IsZeroSlotTableScan() case double counts
> ------------------------------------------------------------------------------
>
> Key: IMPALA-5861
> URL: https://issues.apache.org/jira/browse/IMPALA-5861
> Project: IMPALA
> Issue Type: Bug
> Components: Backend
> Affects Versions: Impala 2.10.0
> Reporter: Dan Hecht
> Priority: Major
>
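> It appears that this code is double counting into {{rows_read_counter()}}: {{row_group_rows_read_}} is already a running total across calls, so adding it (rather than {{num_to_commit}}) to the counter on every call re-counts the rows from all earlier batches: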
> {code:title=HdfsParquetScanner::GetNextInternal()}
> } else if (scan_node_->IsZeroSlotTableScan()) {
> // There are no materialized slots and we are not optimizing count(*), e.g.
> // "select 1 from alltypes". We can serve this query from just the file metadata.
> // We don't need to read the column data.
> if (row_group_rows_read_ == file_metadata_.num_rows) {
> eos_ = true;
> return Status::OK();
> }
> assemble_rows_timer_.Start();
> DCHECK_LE(row_group_rows_read_, file_metadata_.num_rows);
> int64_t rows_remaining = file_metadata_.num_rows - row_group_rows_read_;
> int max_tuples = min<int64_t>(row_batch->capacity(), rows_remaining);
> TupleRow* current_row = row_batch->GetRow(row_batch->AddRow());
> int num_to_commit = WriteTemplateTuples(current_row, max_tuples);
> Status status = CommitRows(row_batch, num_to_commit);
> assemble_rows_timer_.Stop();
> RETURN_IF_ERROR(status);
> row_group_rows_read_ += num_to_commit;
> COUNTER_ADD(scan_node_->rows_read_counter(), row_group_rows_read_); <======
> return Status::OK();
> }
> {code}
> Repro in impala-shell:
> {noformat}
> set batch_size=16; set num_nodes=1; select count(*) from functional.alltypesmixedformat; profile
> ....
> - RowsRead: 3.94K (3936)
> - RowsReturned: 1.20K (1200)
> {noformat}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-all-unsubscribe@impala.apache.org
For additional commands, e-mail: issues-all-help@impala.apache.org