You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/08/08 19:02:15 UTC

[01/11] incubator-impala git commit: IMPALA-5484: Fix LICENSE issues discovered by IPMC in 2.9 vote

Repository: incubator-impala
Updated Branches:
  refs/heads/master c4f903033 -> f14e68c72


IMPALA-5484: Fix LICENSE issues discovered by IPMC in 2.9 vote

Change-Id: I0f98d1b2f514d7afdee8d86a45167905b272ca4d
Reviewed-on: http://gerrit.cloudera.org:8080/7600
Reviewed-by: Henry Robinson <he...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/3deb1a95
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/3deb1a95
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/3deb1a95

Branch: refs/heads/master
Commit: 3deb1a952919c08ed0d93b834bae59a4d73e93cd
Parents: c4f9030
Author: Jim Apple <jb...@apache.org>
Authored: Sun Aug 6 19:15:23 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Mon Aug 7 08:53:34 2017 +0000

----------------------------------------------------------------------
 LICENSE.txt | 3 +++
 1 file changed, 3 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3deb1a95/LICENSE.txt
----------------------------------------------------------------------
diff --git a/LICENSE.txt b/LICENSE.txt
index e5e6611..da12783 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -396,6 +396,7 @@ www/DataTables* and www/datatables*: MIT license
 
 --------------------------------------------------------------------------------
 
+shell/pkg_resources.py: Python Software License V2
 Parts of be/src/runtime/string-search.h: Python Software License V2
 
   Copyright (c) 2001 - 2016 Python Software Foundation; All Rights Reserved
@@ -765,3 +766,5 @@ Some portions of this module are derived from code from LevelDB
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 --------------------------------------------------------------------------------
+
+SHA-1 code from be/src/thirdparty/squeasel/squeasel.c: public domain


[04/11] incubator-impala git commit: IMPALA-4674: Part 2.5: Rename BufferedTupleStreamV2

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/runtime/buffered-tuple-stream.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream.cc b/be/src/runtime/buffered-tuple-stream.cc
new file mode 100644
index 0000000..38dc44c
--- /dev/null
+++ b/be/src/runtime/buffered-tuple-stream.cc
@@ -0,0 +1,1084 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/buffered-tuple-stream.inline.h"
+
+#include <boost/bind.hpp>
+#include <gutil/strings/substitute.h>
+
+#include "runtime/bufferpool/reservation-tracker.h"
+#include "runtime/collection-value.h"
+#include "runtime/descriptors.h"
+#include "runtime/exec-env.h"
+#include "runtime/mem-tracker.h"
+#include "runtime/row-batch.h"
+#include "runtime/runtime-state.h"
+#include "runtime/string-value.h"
+#include "runtime/tuple-row.h"
+#include "util/bit-util.h"
+#include "util/debug-util.h"
+#include "util/runtime-profile-counters.h"
+
+#include "common/names.h"
+
+#ifdef NDEBUG  // When NDEBUG is defined the consistency-check macros expand to nothing.
+#define CHECK_CONSISTENCY_FAST()
+#define CHECK_CONSISTENCY_FULL()
+#else  // Otherwise the macros call the member functions of the same name.
+#define CHECK_CONSISTENCY_FAST() CheckConsistencyFast()
+#define CHECK_CONSISTENCY_FULL() CheckConsistencyFull()
+#endif
+
+using namespace impala;
+using namespace strings;
+
+using BufferHandle = BufferPool::BufferHandle;
+
+BufferedTupleStream::BufferedTupleStream(RuntimeState* state,
+    const RowDescriptor* row_desc, BufferPool::ClientHandle* buffer_pool_client,
+    int64_t default_page_len, int64_t max_page_len, const set<SlotId>& ext_varlen_slots)
+  : state_(state),
+    desc_(row_desc),
+    node_id_(-1),
+    buffer_pool_(state->exec_env()->buffer_pool()),
+    buffer_pool_client_(buffer_pool_client),
+    num_pages_(0),
+    total_byte_size_(0),
+    has_read_iterator_(false),
+    read_page_reservation_(buffer_pool_client_),
+    read_page_rows_returned_(-1),
+    read_ptr_(nullptr),
+    read_end_ptr_(nullptr),
+    write_ptr_(nullptr),
+    write_end_ptr_(nullptr),
+    rows_returned_(0),
+    has_write_iterator_(false),
+    write_page_(nullptr),
+    write_page_reservation_(buffer_pool_client_),
+    bytes_pinned_(0),
+    num_rows_(0),
+    default_page_len_(default_page_len),
+    max_page_len_(max_page_len),
+    has_nullable_tuple_(row_desc->IsAnyTupleNullable()),
+    delete_on_read_(false),
+    closed_(false),
+    pinned_(true) {  // A newly constructed stream starts out in pinned mode.
+  DCHECK_GE(max_page_len, default_page_len);
+  DCHECK(BitUtil::IsPowerOf2(default_page_len)) << default_page_len;
+  DCHECK(BitUtil::IsPowerOf2(max_page_len)) << max_page_len;
+  read_page_ = pages_.end();  // pages_.end() means "no current read page".
+  for (int i = 0; i < desc_->tuple_descriptors().size(); ++i) {  // One pass per tuple.
+    const TupleDescriptor* tuple_desc = desc_->tuple_descriptors()[i];
+    const int tuple_byte_size = tuple_desc->byte_size();
+    fixed_tuple_sizes_.push_back(tuple_byte_size);
+
+    vector<SlotDescriptor*> tuple_string_slots;
+    vector<SlotDescriptor*> tuple_coll_slots;
+    for (int j = 0; j < tuple_desc->slots().size(); ++j) {
+      SlotDescriptor* slot = tuple_desc->slots()[j];
+      if (!slot->type().IsVarLenType()) continue;  // Only var-len slots are tracked.
+      if (ext_varlen_slots.find(slot->id()) == ext_varlen_slots.end()) {  // Not external.
+        if (slot->type().IsVarLenStringType()) {
+          tuple_string_slots.push_back(slot);
+        } else {
+          DCHECK(slot->type().IsCollectionType());
+          tuple_coll_slots.push_back(slot);
+        }
+      }
+    }
+    if (!tuple_string_slots.empty()) {  // Keyed by the tuple's index within the row.
+      inlined_string_slots_.push_back(make_pair(i, tuple_string_slots));
+    }
+
+    if (!tuple_coll_slots.empty()) {  // Keyed by the tuple's index within the row.
+      inlined_coll_slots_.push_back(make_pair(i, tuple_coll_slots));
+    }
+  }
+}
+
+BufferedTupleStream::~BufferedTupleStream() {
+  DCHECK(closed_);  // Close() must have been called before the stream is destroyed.
+}
+
+void BufferedTupleStream::CheckConsistencyFull() const {
+  CheckConsistencyFast();  // Run the O(1) checks first.
+  // The below checks require iterating over all the pages in the stream.
+  DCHECK_EQ(bytes_pinned_, CalcBytesPinned()) << DebugString();
+  DCHECK_EQ(pages_.size(), num_pages_) << DebugString();
+  for (const Page& page : pages_) CheckPageConsistency(&page);
+}
+
+void BufferedTupleStream::CheckConsistencyFast() const {
+  // All the below checks should be O(1).
+  DCHECK(has_write_iterator() || write_page_ == nullptr);
+  if (write_page_ != nullptr) {
+    CheckPageConsistency(write_page_);
+    DCHECK(write_page_->is_pinned());
+    DCHECK(write_page_->retrieved_buffer);
+    const BufferHandle* write_buffer;
+    Status status = write_page_->GetBuffer(&write_buffer);
+    DCHECK(status.ok()); // Write buffer should never have been unpinned.
+    DCHECK_GE(write_ptr_, write_buffer->data());
+    DCHECK_EQ(write_end_ptr_, write_buffer->data() + write_page_->len());
+    DCHECK_GE(write_end_ptr_, write_ptr_);
+  }
+  DCHECK(has_read_iterator() || read_page_ == pages_.end());
+  if (read_page_ != pages_.end()) {
+    CheckPageConsistency(&*read_page_);
+    DCHECK(read_page_->is_pinned());
+    DCHECK(read_page_->retrieved_buffer);
+    // Can't check read buffer without affecting behaviour, because a read may be in
+    // flight and this would require blocking on it.
+    DCHECK_GE(read_end_ptr_, read_ptr_);
+  }
+  if (NeedReadReservation()) {  // Saved read reservation is exactly one default page.
+    DCHECK_EQ(default_page_len_, read_page_reservation_.GetReservation())
+        << DebugString();
+  } else if (!read_page_reservation_.is_closed()) {
+    DCHECK_EQ(0, read_page_reservation_.GetReservation());
+  }
+  if (NeedWriteReservation()) {  // Saved write reservation is exactly one default page.
+    DCHECK_EQ(default_page_len_, write_page_reservation_.GetReservation());
+  } else if (!write_page_reservation_.is_closed()) {
+    DCHECK_EQ(0, write_page_reservation_.GetReservation());
+  }
+}
+
+void BufferedTupleStream::CheckPageConsistency(const Page* page) const {
+  // The pin count must match what the stream's current pinned mode expects.
+  DCHECK_EQ(ExpectedPinCount(pinned_, page), page->pin_count()) << DebugString();
+  // Only one large row per page.
+  if (page->len() > default_page_len_) DCHECK_LE(page->num_rows, 1);
+  // We only create pages when we have a row to append to them.
+  DCHECK_GT(page->num_rows, 0);
+}
+
+string BufferedTupleStream::DebugString() const {
+  stringstream ss;  // Multi-line dump of the stream's state followed by every page.
+  ss << "BufferedTupleStream num_rows=" << num_rows_
+     << " rows_returned=" << rows_returned_ << " pinned=" << pinned_
+     << " delete_on_read=" << delete_on_read_ << " closed=" << closed_ << "\n"
+     << " bytes_pinned=" << bytes_pinned_ << " has_write_iterator=" << has_write_iterator_
+     << " write_page=" << write_page_ << " has_read_iterator=" << has_read_iterator_
+     << " read_page=";
+  if (read_page_ == pages_.end()) {  // No current read page to print the address of.
+    ss << "<end>";
+  } else {
+    ss << &*read_page_;
+  }
+  ss << "\n"
+     << " read_page_reservation=";
+  if (read_page_reservation_.is_closed()) {  // Skip GetReservation() once closed.
+    ss << "<closed>";
+  } else {
+    ss << read_page_reservation_.GetReservation();
+  }
+  ss << " write_page_reservation=";
+  if (write_page_reservation_.is_closed()) {  // Skip GetReservation() once closed.
+    ss << "<closed>";
+  } else {
+    ss << write_page_reservation_.GetReservation();
+  }
+  ss << "\n # pages=" << num_pages_ << " pages=[\n";
+  for (const Page& page : pages_) {
+    ss << "{" << page.DebugString() << "}";
+    if (&page != &pages_.back()) ss << ",\n";  // Separator after all but the last page.
+  }
+  ss << "]";
+  return ss.str();
+}
+
+string BufferedTupleStream::Page::DebugString() const {
+  return Substitute("$0 num_rows=$1", handle.DebugString(), num_rows);  // Handle, then rows.
+}
+
+Status BufferedTupleStream::Init(int node_id, bool pinned) {
+  if (!pinned) UnpinStream(UNPIN_ALL_EXCEPT_CURRENT);  // Switch to unpinned mode.
+  node_id_ = node_id;  // Reported in the MAX_ROW_SIZE error from CalcPageLenForRow().
+  return Status::OK();  // This function currently has no failure path.
+}
+
+Status BufferedTupleStream::PrepareForWrite(bool* got_reservation) {
+  // This must be the first iterator created.
+  DCHECK(pages_.empty());
+  DCHECK(!delete_on_read_);
+  DCHECK(!has_write_iterator());
+  DCHECK(!has_read_iterator());
+  CHECK_CONSISTENCY_FULL();
+
+  *got_reservation = buffer_pool_client_->IncreaseReservationToFit(default_page_len_);
+  if (!*got_reservation) return Status::OK();  // Not an error; no iterator is created.
+  has_write_iterator_ = true;
+  // Save reservation for the write iterator.
+  buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
+  CHECK_CONSISTENCY_FULL();
+  return Status::OK();
+}
+
+Status BufferedTupleStream::PrepareForReadWrite(
+    bool delete_on_read, bool* got_reservation) {
+  // This must be the first iterator created.
+  DCHECK(pages_.empty());
+  DCHECK(!delete_on_read_);
+  DCHECK(!has_write_iterator());
+  DCHECK(!has_read_iterator());
+  CHECK_CONSISTENCY_FULL();
+
+  *got_reservation = buffer_pool_client_->IncreaseReservationToFit(2 * default_page_len_);
+  if (!*got_reservation) return Status::OK();  // Not an error; no iterator is created.
+  has_write_iterator_ = true;
+  // Save reservation for both the read and write iterators.
+  buffer_pool_client_->SaveReservation(&read_page_reservation_, default_page_len_);
+  buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
+  RETURN_IF_ERROR(PrepareForReadInternal(delete_on_read));  // Creates the read iterator.
+  return Status::OK();
+}
+
+void BufferedTupleStream::Close(RowBatch* batch, RowBatch::FlushMode flush) {
+  for (Page& page : pages_) {
+    if (batch != nullptr && page.retrieved_buffer) {
+      // Subtle: We only need to attach buffers from pages that we may have returned
+      // references to. ExtractBuffer() cannot fail for these pages because the data
+      // is guaranteed to already be in memory.
+      BufferPool::BufferHandle buffer;
+      Status status = buffer_pool_->ExtractBuffer(buffer_pool_client_, &page.handle, &buffer);
+      DCHECK(status.ok());
+      batch->AddBuffer(buffer_pool_client_, move(buffer), flush);
+    } else {  // No batch to attach to, or the page's buffer was never retrieved.
+      buffer_pool_->DestroyPage(buffer_pool_client_, &page.handle);
+    }
+  }
+  read_page_reservation_.Close();
+  write_page_reservation_.Close();
+  pages_.clear();
+  num_pages_ = 0;
+  bytes_pinned_ = 0;
+  closed_ = true;  // Checked by the destructor.
+}
+
+int64_t BufferedTupleStream::CalcBytesPinned() const {
+  int64_t result = 0;  // Sum of pin_count() * len() over all pages in the stream.
+  for (const Page& page : pages_) result += page.pin_count() * page.len();
+  return result;
+}
+
+Status BufferedTupleStream::PinPage(Page* page) {
+  RETURN_IF_ERROR(buffer_pool_->Pin(buffer_pool_client_, &page->handle));
+  bytes_pinned_ += page->len();  // Only updated when the pin succeeded.
+  return Status::OK();
+}
+
+int BufferedTupleStream::ExpectedPinCount(bool stream_pinned, const Page* page) const {
+  // 1 if the stream is pinned or 'page' is the current read or write page, otherwise 0.
+  return (stream_pinned || is_read_page(page) || is_write_page(page)) ? 1 : 0;
+}
+
+Status BufferedTupleStream::PinPageIfNeeded(Page* page, bool stream_pinned) {
+  int new_pin_count = ExpectedPinCount(stream_pinned, page);
+  if (new_pin_count != page->pin_count()) {
+    DCHECK_EQ(new_pin_count, page->pin_count() + 1);  // Exactly one pin may be missing.
+    RETURN_IF_ERROR(PinPage(page));
+  }
+  return Status::OK();
+}
+
+void BufferedTupleStream::UnpinPageIfNeeded(Page* page, bool stream_pinned) {
+  int new_pin_count = ExpectedPinCount(stream_pinned, page);
+  if (new_pin_count != page->pin_count()) {
+    DCHECK_EQ(new_pin_count, page->pin_count() - 1);  // Exactly one pin may be surplus.
+    buffer_pool_->Unpin(buffer_pool_client_, &page->handle);
+    bytes_pinned_ -= page->len();
+    if (page->pin_count() == 0) page->retrieved_buffer = false;  // Now fully unpinned.
+  }
+}
+
+bool BufferedTupleStream::NeedWriteReservation() const {
+  return NeedWriteReservation(pinned_);  // Evaluate for the stream's current pinned mode.
+}
+
+bool BufferedTupleStream::NeedWriteReservation(bool stream_pinned) const {
+  // Evaluate for the given pinned mode, taking everything else from the current state.
+  return NeedWriteReservation(stream_pinned, num_pages_, has_write_iterator(),
+      write_page_ != nullptr, has_read_write_page());
+}
+
+bool BufferedTupleStream::NeedWriteReservation(bool stream_pinned, int64_t num_pages,
+    bool has_write_iterator, bool has_write_page, bool has_read_write_page) {
+  if (!has_write_iterator) return false;  // Depends only on the arguments passed in.
+  // If the stream is empty the write reservation hasn't been used yet.
+  if (num_pages == 0) return true;
+  if (stream_pinned) {
+    // Make sure we've saved the write reservation for the next page if the only
+    // page is a read/write page.
+    return has_read_write_page && num_pages == 1;
+  } else {
+    // Make sure we've saved the write reservation if it's not being used to pin
+    // a page in the stream.
+    return !has_write_page || has_read_write_page;
+  }
+}
+
+bool BufferedTupleStream::NeedReadReservation() const {
+  return NeedReadReservation(pinned_);  // Evaluate for the stream's current pinned mode.
+}
+
+bool BufferedTupleStream::NeedReadReservation(bool stream_pinned) const {
+  // Evaluate for the given pinned mode, taking everything else from the current state.
+  return NeedReadReservation(
+      stream_pinned, num_pages_, has_read_iterator(), read_page_ != pages_.end());
+}
+
+bool BufferedTupleStream::NeedReadReservation(bool stream_pinned, int64_t num_pages,
+    bool has_read_iterator, bool has_read_page) const {
+  // Read-side state comes from the arguments; write-side state from the stream itself.
+  return NeedReadReservation(stream_pinned, num_pages, has_read_iterator, has_read_page,
+      has_write_iterator(), write_page_ != nullptr);
+}
+
+bool BufferedTupleStream::NeedReadReservation(bool stream_pinned, int64_t num_pages,
+    bool has_read_iterator, bool has_read_page, bool has_write_iterator,
+    bool has_write_page) {
+  if (!has_read_iterator) return false;  // 'has_write_page' is not consulted below.
+  if (stream_pinned) {
+    // Need reservation if there are no pages currently pinned for reading but we may add
+    // a page.
+    return num_pages == 0 && has_write_iterator;
+  } else {
+    // Only need to save reservation for an unpinned stream if there is no read page
+    // and we may advance to one in the future.
+    return (has_write_iterator || num_pages > 0) && !has_read_page;
+  }
+}
+
+Status BufferedTupleStream::NewWritePage(int64_t page_len) noexcept {
+  DCHECK(!closed_);
+  DCHECK(write_page_ == nullptr);  // Any previous write page must already be reset.
+
+  Page new_page;
+  const BufferHandle* write_buffer;
+  RETURN_IF_ERROR(buffer_pool_->CreatePage(
+      buffer_pool_client_, page_len, &new_page.handle, &write_buffer));
+  bytes_pinned_ += page_len;  // The new page is counted as pinned.
+  total_byte_size_ += page_len;
+
+  pages_.push_back(std::move(new_page));  // Appended at the end of the stream.
+  ++num_pages_;
+  write_page_ = &pages_.back();
+  DCHECK_EQ(write_page_->num_rows, 0);
+  write_ptr_ = write_buffer->data();  // Writing starts at the beginning of the buffer.
+  write_end_ptr_ = write_ptr_ + page_len;
+  return Status::OK();
+}
+
+Status BufferedTupleStream::CalcPageLenForRow(int64_t row_size, int64_t* page_len) {
+  if (UNLIKELY(row_size > max_page_len_)) {  // Row exceeds the largest allowed page.
+    return Status(TErrorCode::MAX_ROW_SIZE,
+        PrettyPrinter::Print(row_size, TUnit::BYTES), node_id_,
+        PrettyPrinter::Print(0, TUnit::BYTES));  // NOTE(review): always 0 - confirm intent.
+  }
+  *page_len = max(default_page_len_, BitUtil::RoundUpToPowerOfTwo(row_size));
+  return Status::OK();  // '*page_len' is only written on this success path.
+}
+
+Status BufferedTupleStream::AdvanceWritePage(
+    int64_t row_size, bool* got_reservation) noexcept {
+  DCHECK(has_write_iterator());
+  CHECK_CONSISTENCY_FAST();
+
+  int64_t page_len;
+  RETURN_IF_ERROR(CalcPageLenForRow(row_size, &page_len));  // Fails for oversized rows.
+
+  // Reservation may have been saved for the next write page, e.g. by PrepareForWrite()
+  // if the stream is empty.
+  int64_t write_reservation_to_restore = 0, read_reservation_to_restore = 0;
+  if (NeedWriteReservation(
+          pinned_, num_pages_, true, write_page_ != nullptr, has_read_write_page())
+      && !NeedWriteReservation(pinned_, num_pages_ + 1, true, true, false)) {
+    write_reservation_to_restore = default_page_len_;
+  }
+  // If the stream is pinned, we need to keep the previous write page pinned for reading.
+  // Check if we saved reservation for this case.
+  if (NeedReadReservation(pinned_, num_pages_, has_read_iterator(),
+          read_page_ != pages_.end(), true, write_page_ != nullptr)
+      && !NeedReadReservation(pinned_, num_pages_ + 1, has_read_iterator(),
+             read_page_ != pages_.end(), true, true)) {
+    read_reservation_to_restore = default_page_len_;
+  }
+
+  // We may reclaim reservation by unpinning a page that was pinned for writing.
+  int64_t write_page_reservation_to_reclaim =
+      (write_page_ != nullptr && !pinned_ && !has_read_write_page()) ?
+      write_page_->len() : 0;
+  // Check to see if we can get the reservation before changing the state of the stream.
+  if (!buffer_pool_client_->IncreaseReservationToFit(page_len
+          - write_reservation_to_restore - read_reservation_to_restore
+          - write_page_reservation_to_reclaim)) {
+    DCHECK(pinned_ || page_len > default_page_len_)
+        << "If the stream is unpinned, this should only fail for large pages";
+    CHECK_CONSISTENCY_FAST();
+    *got_reservation = false;  // Reported through the out-parameter, not as an error.
+    return Status::OK();
+  }
+  if (write_reservation_to_restore > 0) {
+    buffer_pool_client_->RestoreReservation(
+        &write_page_reservation_, write_reservation_to_restore);
+  }
+  if (read_reservation_to_restore > 0) {
+    buffer_pool_client_->RestoreReservation(
+        &read_page_reservation_, read_reservation_to_restore);
+  }
+  ResetWritePage();  // Detach the previous write page, unpinning it if needed.
+  RETURN_IF_ERROR(NewWritePage(page_len));
+  *got_reservation = true;
+  return Status::OK();
+}
+
+void BufferedTupleStream::ResetWritePage() {
+  if (write_page_ == nullptr) return;  // No write page: nothing to reset.
+  // Detach the current write page and clear the write pointers.
+  Page* prev_write_page = write_page_;
+  write_page_ = nullptr;
+  write_ptr_ = nullptr;
+  write_end_ptr_ = nullptr;
+
+  // May need to decrement pin count now that it's not the write page, depending on
+  // the stream's mode.
+  UnpinPageIfNeeded(prev_write_page, pinned_);
+}
+
+void BufferedTupleStream::InvalidateWriteIterator() {
+  if (!has_write_iterator()) return;  // No write iterator: nothing to invalidate.
+  ResetWritePage();
+  has_write_iterator_ = false;
+  // No more pages will be appended to stream - do not need any write reservation.
+  write_page_reservation_.Close();
+  // May not need a read reservation once the write iterator is invalidated.
+  if (NeedReadReservation(pinned_, num_pages_, has_read_iterator(),
+          read_page_ != pages_.end(), true, write_page_ != nullptr)
+      && !NeedReadReservation(pinned_, num_pages_, has_read_iterator(),
+             read_page_ != pages_.end(), false, false)) {
+    buffer_pool_client_->RestoreReservation(&read_page_reservation_, default_page_len_);
+  }
+}
+
+Status BufferedTupleStream::NextReadPage() {
+  DCHECK(has_read_iterator());
+  DCHECK(!closed_);
+  CHECK_CONSISTENCY_FAST();
+
+  if (read_page_ == pages_.end()) {
+    // No rows read yet - start reading at first page. If the stream is unpinned, we can
+    // use the reservation saved in PrepareForReadWrite() to pin the first page.
+    read_page_ = pages_.begin();
+    if (NeedReadReservation(pinned_, num_pages_, true, false)
+        && !NeedReadReservation(pinned_, num_pages_, true, true)) {
+      buffer_pool_client_->RestoreReservation(&read_page_reservation_, default_page_len_);
+    }
+  } else if (delete_on_read_) {  // Destroy the page just read instead of keeping it.
+    DCHECK(read_page_ == pages_.begin()) << read_page_->DebugString() << " "
+                                         << DebugString();
+    DCHECK_NE(&*read_page_, write_page_);
+    bytes_pinned_ -= pages_.front().len();
+    buffer_pool_->DestroyPage(buffer_pool_client_, &pages_.front().handle);
+    pages_.pop_front();
+    --num_pages_;
+    read_page_ = pages_.begin();  // The next page is now the first page.
+  } else {
+    // Unpin pages after reading them if needed.
+    Page* prev_read_page = &*read_page_;
+    ++read_page_;
+    UnpinPageIfNeeded(prev_read_page, pinned_);
+  }
+
+  if (read_page_ == pages_.end()) {  // No further page to read.
+    CHECK_CONSISTENCY_FULL();
+    return Status::OK();
+  }
+
+  if (!pinned_ && read_page_->len() > default_page_len_
+      && buffer_pool_client_->GetUnusedReservation() < read_page_->len()) {
+    // If we are iterating over an unpinned stream and encounter a page that is larger
+    // than the default page length, then unpinning the previous page may not have
+    // freed up enough reservation to pin the next one. The client is responsible for
+    // ensuring the reservation is available, so this indicates a bug.
+    return Status(TErrorCode::INTERNAL_ERROR, Substitute("Internal error: couldn't pin "
+          "large page of $0 bytes, client only had $1 bytes of unused reservation:\n$2",
+          read_page_->len(), buffer_pool_client_->GetUnusedReservation(),
+          buffer_pool_client_->DebugString()));
+  }
+  // Ensure the next page is pinned for reading. By this point we should have enough
+  // reservation to pin the page. If the stream is pinned, the page is already pinned.
+  // If the stream is unpinned, we freed up enough memory for a default-sized page by
+  // deleting or unpinning the previous page and ensured that, if the page was larger,
+  // that the reservation is available with the above check.
+  RETURN_IF_ERROR(PinPageIfNeeded(&*read_page_, pinned_));
+
+  // This waits for the pin to complete if the page was unpinned earlier.
+  const BufferHandle* read_buffer;
+  RETURN_IF_ERROR(read_page_->GetBuffer(&read_buffer));
+
+  read_page_rows_returned_ = 0;  // Start at the beginning of the new read page.
+  read_ptr_ = read_buffer->data();
+  read_end_ptr_ = read_ptr_ + read_buffer->len();
+
+  // We may need to save reservation for the write page in the case when the write page
+  // became a read/write page.
+  if (!NeedWriteReservation(pinned_, num_pages_, has_write_iterator(),
+             write_page_ != nullptr, false)
+      && NeedWriteReservation(pinned_, num_pages_, has_write_iterator(),
+             write_page_ != nullptr, has_read_write_page())) {
+    buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
+  }
+  CHECK_CONSISTENCY_FAST();
+  return Status::OK();
+}
+
+void BufferedTupleStream::InvalidateReadIterator() {
+  if (read_page_ != pages_.end()) {
+    // Detach the current read page and clear the read pointers.
+    Page* prev_read_page = &*read_page_;
+    read_page_ = pages_.end();
+    read_ptr_ = nullptr;
+    read_end_ptr_ = nullptr;
+
+    // May need to decrement pin count after destroying read iterator.
+    UnpinPageIfNeeded(prev_read_page, pinned_);
+  }
+  has_read_iterator_ = false;
+  if (read_page_reservation_.GetReservation() > 0) {  // Hand back any saved reservation.
+    buffer_pool_client_->RestoreReservation(&read_page_reservation_, default_page_len_);
+  }
+  // It is safe to re-read a delete-on-read stream if no rows were read and no pages
+  // were therefore deleted.
+  if (rows_returned_ == 0) delete_on_read_ = false;
+}
+
+Status BufferedTupleStream::PrepareForRead(bool delete_on_read, bool* got_reservation) {
+  CHECK_CONSISTENCY_FULL();
+  InvalidateWriteIterator();  // Both existing iterators are dropped before re-reading.
+  InvalidateReadIterator();
+  // If already pinned, no additional pin is needed (see ExpectedPinCount()).
+  *got_reservation = pinned_ || pages_.empty()
+      || buffer_pool_client_->IncreaseReservationToFit(default_page_len_);
+  if (!*got_reservation) return Status::OK();  // Not an error; no read iterator created.
+  return PrepareForReadInternal(delete_on_read);
+}
+
+Status BufferedTupleStream::PrepareForReadInternal(bool delete_on_read) {
+  DCHECK(!closed_);
+  DCHECK(!delete_on_read_);
+  DCHECK(!has_read_iterator());
+
+  has_read_iterator_ = true;
+  if (pages_.empty()) {
+    // No rows to return, or the first read/write page has not yet been allocated.
+    read_page_ = pages_.end();
+    read_ptr_ = nullptr;
+    read_end_ptr_ = nullptr;
+  } else {
+    // Eagerly pin the first page in the stream.
+    read_page_ = pages_.begin();
+    // Check if we need to increment the pin count of the read page.
+    RETURN_IF_ERROR(PinPageIfNeeded(&*read_page_, pinned_));
+    DCHECK(read_page_->is_pinned());
+
+    // This waits for the pin to complete if the page was unpinned earlier.
+    const BufferHandle* read_buffer;
+    RETURN_IF_ERROR(read_page_->GetBuffer(&read_buffer));
+    read_ptr_ = read_buffer->data();
+    read_end_ptr_ = read_ptr_ + read_buffer->len();
+  }
+  read_page_rows_returned_ = 0;  // Reset the read progress counters.
+  rows_returned_ = 0;
+  delete_on_read_ = delete_on_read;
+  CHECK_CONSISTENCY_FULL();
+  return Status::OK();
+}
+
+Status BufferedTupleStream::PinStream(bool* pinned) {
+  DCHECK(!closed_);
+  CHECK_CONSISTENCY_FULL();
+  if (pinned_) {  // Already pinned: nothing to do.
+    *pinned = true;
+    return Status::OK();
+  }
+  *pinned = false;
+  // First, make sure we have the reservation to pin all the pages for reading.
+  int64_t bytes_to_pin = 0;
+  for (Page& page : pages_) {  // Count only the pins each page is still missing.
+    bytes_to_pin += (ExpectedPinCount(true, &page) - page.pin_count()) * page.len();
+  }
+
+  // Check if we have some reservation to restore.
+  bool restore_write_reservation =
+      NeedWriteReservation(false) && !NeedWriteReservation(true);
+  bool restore_read_reservation =
+      NeedReadReservation(false) && !NeedReadReservation(true);
+  int64_t increase_needed = bytes_to_pin
+      - (restore_write_reservation ? default_page_len_ : 0)
+      - (restore_read_reservation ? default_page_len_ : 0);
+  bool reservation_granted =
+      buffer_pool_client_->IncreaseReservationToFit(increase_needed);
+  if (!reservation_granted) return Status::OK();  // Stream stays unpinned; '*pinned' false.
+
+  // If there is no current write page we should have some saved reservation to use.
+  // Only continue saving it if the stream is empty and need it to pin the first page.
+  if (restore_write_reservation) {
+    buffer_pool_client_->RestoreReservation(&write_page_reservation_, default_page_len_);
+  }
+  if (restore_read_reservation) {
+    buffer_pool_client_->RestoreReservation(&read_page_reservation_, default_page_len_);
+  }
+
+  // At this point success is guaranteed - go through to pin the pages we need to pin.
+  // If the page data was evicted from memory, the read I/O can happen in parallel
+  // because we defer calling GetBuffer() until NextReadPage().
+  for (Page& page : pages_) RETURN_IF_ERROR(PinPageIfNeeded(&page, true));
+
+  pinned_ = true;
+  *pinned = true;
+  CHECK_CONSISTENCY_FULL();
+  return Status::OK();
+}
+
+void BufferedTupleStream::UnpinStream(UnpinMode mode) {
+  CHECK_CONSISTENCY_FULL();
+  DCHECK(!closed_);
+  if (mode == UNPIN_ALL) {
+    // Invalidate the iterators so they don't keep pages pinned.
+    InvalidateWriteIterator();
+    InvalidateReadIterator();
+  }
+
+  if (pinned_) {  // Nothing further to do if the stream is already unpinned.
+    CHECK_CONSISTENCY_FULL();
+    // If the stream was pinned, there may be some remaining pinned pages that should
+    // be unpinned at this point.
+    for (Page& page : pages_) UnpinPageIfNeeded(&page, false);
+
+    // Check to see if we need to save some of the reservation we freed up.
+    if (!NeedWriteReservation(true) && NeedWriteReservation(false)) {
+      buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
+    }
+    if (!NeedReadReservation(true) && NeedReadReservation(false)) {
+      buffer_pool_client_->SaveReservation(&read_page_reservation_, default_page_len_);
+    }
+    pinned_ = false;  // Flipped only after the pages and reservations are adjusted.
+  }
+  CHECK_CONSISTENCY_FULL();
+}
+
+Status BufferedTupleStream::GetRows(
+    MemTracker* tracker, scoped_ptr<RowBatch>* batch, bool* got_rows) {
+  if (num_rows() > numeric_limits<int>::max()) {
+    // RowBatch::num_rows_ is a 32-bit int, avoid an overflow.
+    return Status(Substitute("Trying to read $0 rows into in-memory batch failed. Limit "
+                             "is $1",
+        num_rows(), numeric_limits<int>::max()));
+  }
+  RETURN_IF_ERROR(PinStream(got_rows));
+  if (!*got_rows) return Status::OK();  // Could not pin the stream; 'batch' is untouched.
+  bool got_reservation;
+  RETURN_IF_ERROR(PrepareForRead(false, &got_reservation));
+  DCHECK(got_reservation) << "Stream was pinned";
+  batch->reset(new RowBatch(desc_, num_rows(), tracker));  // Capacity for every row.
+  bool eos = false;
+  // Loop until GetNext fills the entire batch. Each call can stop at page
+  // boundaries. We generally want it to stop, so that pages can be freed
+  // as we read. It is safe in this case because we pin the entire stream.
+  while (!eos) {
+    RETURN_IF_ERROR(GetNext(batch->get(), &eos));
+  }
+  return Status::OK();
+}
+
+Status BufferedTupleStream::GetNext(RowBatch* batch, bool* eos) {
+  return GetNextInternal<false>(batch, eos, nullptr);  // Does not collect FlatRowPtrs.
+}
+
+Status BufferedTupleStream::GetNext(
+    RowBatch* batch, bool* eos, vector<FlatRowPtr>* flat_rows) {
+  return GetNextInternal<true>(batch, eos, flat_rows);  // Also fills 'flat_rows'.
+}
+
+template <bool FILL_FLAT_ROWS>
+Status BufferedTupleStream::GetNextInternal(
+    RowBatch* batch, bool* eos, vector<FlatRowPtr>* flat_rows) {
+  if (has_nullable_tuple_) {  // Turn the runtime flag into a template argument.
+    return GetNextInternal<FILL_FLAT_ROWS, true>(batch, eos, flat_rows);
+  } else {
+    return GetNextInternal<FILL_FLAT_ROWS, false>(batch, eos, flat_rows);
+  }
+}
+
+/// Core of GetNext(): appends rows from the current read page to 'batch' and fixes up
+/// the inlined string and collection pointers of each row so that they point at the
+/// var-len data stored directly after the row's tuples.
+///
+/// A single call reads from at most one page: the number of rows produced is the
+/// smaller of the free capacity of 'batch' and the rows remaining in the read page.
+/// '*eos' is set to true once 'rows_returned_' reaches 'num_rows_'.
+///
+/// FILL_FLAT_ROWS: if true, 'flat_rows' is cleared and then filled with the start
+///     address of every row returned by this call.
+/// HAS_NULLABLE_TUPLE: forwarded to UnflattenTupleRow(); when true, null tuples are
+///     skipped during pointer fix-up.
+template <bool FILL_FLAT_ROWS, bool HAS_NULLABLE_TUPLE>
+Status BufferedTupleStream::GetNextInternal(
+    RowBatch* batch, bool* eos, vector<FlatRowPtr>* flat_rows) {
+  DCHECK(!closed_);
+  DCHECK(batch->row_desc()->Equals(*desc_));
+  DCHECK(is_pinned() || !FILL_FLAT_ROWS)
+      << "FlatRowPtrs are only valid for pinned streams";
+  // Nothing left to read: report end-of-stream without touching the read page.
+  *eos = (rows_returned_ == num_rows_);
+  if (*eos) return Status::OK();
+
+  if (UNLIKELY(read_page_ == pages_.end()
+          || read_page_rows_returned_ == read_page_->num_rows)) {
+    // Get the next page in the stream (or the first page if read_page_ was not yet
+    // initialized.) We need to do this at the beginning of the GetNext() call to ensure
+    // the buffer management semantics. NextReadPage() may unpin or delete the buffer
+    // backing the rows returned from the *previous* call to GetNext().
+    RETURN_IF_ERROR(NextReadPage());
+  }
+
+  DCHECK(has_read_iterator());
+  DCHECK(read_page_ != pages_.end());
+  DCHECK(read_page_->is_pinned()) << DebugString();
+  DCHECK_GE(read_page_rows_returned_, 0);
+
+  // Never read past the end of the current page or the capacity of 'batch'.
+  int rows_left_in_page = read_page_->num_rows - read_page_rows_returned_;
+  int rows_to_fill = std::min(batch->capacity() - batch->num_rows(), rows_left_in_page);
+  DCHECK_GE(rows_to_fill, 1);
+  // Address of the first uncommitted row in 'batch'; advanced manually in the loop.
+  uint8_t* tuple_row_mem = reinterpret_cast<uint8_t*>(batch->GetRow(batch->num_rows()));
+
+  // Produce tuple rows from the current page and the corresponding position on the
+  // null tuple indicator.
+  if (FILL_FLAT_ROWS) {
+    DCHECK(flat_rows != nullptr);
+    DCHECK(!delete_on_read_);
+    DCHECK_EQ(batch->num_rows(), 0);
+    flat_rows->clear();
+    flat_rows->reserve(rows_to_fill);
+  }
+
+  const uint64_t tuples_per_row = desc_->tuple_descriptors().size();
+  // Start reading from the current position in 'read_page_'.
+  for (int i = 0; i < rows_to_fill; ++i) {
+    if (FILL_FLAT_ROWS) {
+      // 'read_ptr_' is at the start of the flattened row before it is unflattened.
+      flat_rows->push_back(read_ptr_);
+      DCHECK_EQ(flat_rows->size(), i + 1);
+    }
+    // Copy the row into the output batch.
+    TupleRow* output_row = reinterpret_cast<TupleRow*>(tuple_row_mem);
+    tuple_row_mem += sizeof(Tuple*) * tuples_per_row;
+    UnflattenTupleRow<HAS_NULLABLE_TUPLE>(&read_ptr_, output_row);
+
+    // Update string slot ptrs, skipping external strings.
+    for (int j = 0; j < inlined_string_slots_.size(); ++j) {
+      Tuple* tuple = output_row->GetTuple(inlined_string_slots_[j].first);
+      if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue;
+      FixUpStringsForRead(inlined_string_slots_[j].second, tuple);
+    }
+
+    // Update collection slot ptrs, skipping external collections. We traverse the
+    // collection structure in the same order as it was written to the stream, allowing
+    // us to infer the data layout based on the length of collections and strings.
+    for (int j = 0; j < inlined_coll_slots_.size(); ++j) {
+      Tuple* tuple = output_row->GetTuple(inlined_coll_slots_[j].first);
+      if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue;
+      FixUpCollectionsForRead(inlined_coll_slots_[j].second, tuple);
+    }
+  }
+
+  // Publish the rows to 'batch' and advance both the stream-wide and per-page counters.
+  batch->CommitRows(rows_to_fill);
+  rows_returned_ += rows_to_fill;
+  read_page_rows_returned_ += rows_to_fill;
+  *eos = (rows_returned_ == num_rows_);
+  if (read_page_rows_returned_ == read_page_->num_rows && (!pinned_ || delete_on_read_)) {
+    // No more data in this page. The batch must be immediately returned up the operator
+    // tree and deep copied so that NextReadPage() can reuse the read page's buffer.
+    // TODO: IMPALA-4179 - instead attach the buffer and flush the resources.
+    batch->MarkNeedsDeepCopy();
+  }
+  if (FILL_FLAT_ROWS) DCHECK_EQ(flat_rows->size(), rows_to_fill);
+  DCHECK_LE(read_ptr_, read_end_ptr_);
+  return Status::OK();
+}
+
+/// Points every non-null string slot of 'tuple' listed in 'string_slots' at the bytes
+/// currently under 'read_ptr_', then advances 'read_ptr_' by the string's length.
+/// 'tuple' must not be null.
+void BufferedTupleStream::FixUpStringsForRead(
+    const vector<SlotDescriptor*>& string_slots, Tuple* tuple) {
+  DCHECK(tuple != nullptr);
+  for (auto slot_it = string_slots.begin(); slot_it != string_slots.end(); ++slot_it) {
+    const SlotDescriptor* slot = *slot_it;
+    const bool slot_is_null = tuple->IsNull(slot->null_indicator_offset());
+    if (!slot_is_null) {
+      StringValue* str = tuple->GetStringSlot(slot->tuple_offset());
+      // The string payload must lie entirely within the readable part of the page.
+      DCHECK_LE(read_ptr_ + str->len, read_end_ptr_);
+      str->ptr = reinterpret_cast<char*>(read_ptr_);
+      read_ptr_ += str->len;
+    }
+  }
+}
+
+/// Points every non-null collection slot of 'tuple' listed in 'collection_slots' at the
+/// item tuples currently under 'read_ptr_' and advances 'read_ptr_' past them. If the
+/// item tuples themselves contain var-len slots, their strings and nested collections
+/// are fixed up recursively, item by item, in the order they appear in the collection.
+/// 'tuple' must not be null.
+void BufferedTupleStream::FixUpCollectionsForRead(
+    const vector<SlotDescriptor*>& collection_slots, Tuple* tuple) {
+  DCHECK(tuple != nullptr);
+  for (const SlotDescriptor* slot_desc : collection_slots) {
+    if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
+
+    CollectionValue* cv = tuple->GetCollectionSlot(slot_desc->tuple_offset());
+    const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor();
+    // Compute the size in 64-bit arithmetic: the product of the item count and the
+    // item size can exceed the range of 'int' for large collections.
+    const int64_t item_byte_size = item_desc.byte_size();
+    const int64_t coll_byte_size = cv->num_tuples * item_byte_size;
+    DCHECK_LE(read_ptr_ + coll_byte_size, read_end_ptr_);
+    cv->ptr = reinterpret_cast<uint8_t*>(read_ptr_);
+    read_ptr_ += coll_byte_size;
+
+    if (!item_desc.HasVarlenSlots()) continue;
+    uint8_t* coll_data = cv->ptr;
+    for (int i = 0; i < cv->num_tuples; ++i) {
+      Tuple* item = reinterpret_cast<Tuple*>(coll_data);
+      FixUpStringsForRead(item_desc.string_slots(), item);
+      FixUpCollectionsForRead(item_desc.collection_slots(), item);
+      coll_data += item_byte_size;
+    }
+  }
+}
+
+/// Returns the number of bytes 'row' occupies when flattened into the stream: the null
+/// indicator bytes (only if the stream has nullable tuples), the fixed-length part of
+/// every non-null tuple, and the var-len data of all inlined string and collection
+/// slots, including var-len data of the collections' item tuples.
+int64_t BufferedTupleStream::ComputeRowSize(TupleRow* row) const noexcept {
+  int64_t size = 0;
+  if (has_nullable_tuple_) {
+    size += NullIndicatorBytesPerRow();
+    for (int i = 0; i < fixed_tuple_sizes_.size(); ++i) {
+      // Null tuples contribute no fixed-length bytes, only their indicator bit.
+      if (row->GetTuple(i) != nullptr) size += fixed_tuple_sizes_[i];
+    }
+  } else {
+    for (const auto tuple_size : fixed_tuple_sizes_) size += tuple_size;
+  }
+
+  // Inlined strings: only the payload length is added; the StringValue itself is part
+  // of the tuple's fixed-length bytes counted above.
+  for (const auto& tuple_slots : inlined_string_slots_) {
+    Tuple* tuple = row->GetTuple(tuple_slots.first);
+    if (tuple == nullptr) continue;
+    for (const SlotDescriptor* slot_desc : tuple_slots.second) {
+      if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
+      size += tuple->GetStringSlot(slot_desc->tuple_offset())->len;
+    }
+  }
+
+  for (const auto& tuple_slots : inlined_coll_slots_) {
+    Tuple* tuple = row->GetTuple(tuple_slots.first);
+    if (tuple == nullptr) continue;
+    for (const SlotDescriptor* slot_desc : tuple_slots.second) {
+      if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
+      CollectionValue* cv = tuple->GetCollectionSlot(slot_desc->tuple_offset());
+      const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor();
+      // Widen before multiplying so that the item count times the item size cannot
+      // overflow a narrower integer type before being added to the 64-bit total.
+      const int64_t item_byte_size = item_desc.byte_size();
+      size += cv->num_tuples * item_byte_size;
+
+      if (!item_desc.HasVarlenSlots()) continue;
+      for (int64_t j = 0; j < cv->num_tuples; ++j) {
+        Tuple* item = reinterpret_cast<Tuple*>(&cv->ptr[j * item_byte_size]);
+        size += item->VarlenByteSize(item_desc);
+      }
+    }
+  }
+  return size;
+}
+
+/// Slow path of AddRow(): computes the exact flattened size of 'row', obtains space
+/// for it via AddRowCustomBeginSlow() and deep-copies the row into that space.
+/// Returns false if no space could be obtained, in which case '*status' has been set
+/// by AddRowCustomBeginSlow().
+bool BufferedTupleStream::AddRowSlow(TupleRow* row, Status* status) noexcept {
+  // Use AddRowCustom*() to do the work of advancing the page.
+  const int64_t bytes_needed = ComputeRowSize(row);
+  uint8_t* dst = AddRowCustomBeginSlow(bytes_needed, status);
+  if (dst != nullptr) {
+    uint8_t* const dst_end = dst + bytes_needed;
+    // Exactly 'bytes_needed' bytes were allocated, so the copy is expected to succeed.
+    bool copied = DeepCopy(row, &dst, dst_end);
+    DCHECK(copied);
+    DCHECK_EQ(dst, write_ptr_);
+    AddRowCustomEnd(bytes_needed);
+    return true;
+  }
+  return false;
+}
+
+/// Slow path of AddRowCustomBegin(): advances to a write page that can hold 'size'
+/// bytes and then allocates the row from it. Returns nullptr if AdvanceWritePage()
+/// returned an error (stored in '*status') or could not get the needed reservation.
+uint8_t* BufferedTupleStream::AddRowCustomBeginSlow(
+    int64_t size, Status* status) noexcept {
+  bool got_reservation;
+  *status = AdvanceWritePage(size, &got_reservation);
+  // 'got_reservation' is only inspected when the status is OK.
+  const bool page_ready = status->ok() && got_reservation;
+  if (!page_ready) return nullptr;
+
+  // We have a large-enough page so now success is guaranteed.
+  uint8_t* row_start = AddRowCustomBegin(size, status);
+  DCHECK(row_start != nullptr);
+  return row_start;
+}
+
+/// Finishes adding a row that is larger than the default page length: releases the
+/// large write page and, if required, sets aside reservation for the next
+/// default-length write page.
+void BufferedTupleStream::AddLargeRowCustomEnd(int64_t size) noexcept {
+  DCHECK_GT(size, default_page_len_);
+  // Immediately unpin the large write page so that we're not using up extra reservation
+  // and so we don't append another row to the page.
+  ResetWritePage();
+  // Save some of the reservation we freed up so we can create the next write page when
+  // needed.
+  const bool must_save_reservation = NeedWriteReservation();
+  if (must_save_reservation) {
+    buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
+  }
+  // The stream should be in a consistent state once the row is added.
+  CHECK_CONSISTENCY_FAST();
+}
+
+/// Appends a deep copy of 'row' to the stream. The fast path copies the row straight
+/// into the current write page; if there is no write page or the row does not fit,
+/// the work is delegated to AddRowSlow(), which also sets '*status'.
+bool BufferedTupleStream::AddRow(TupleRow* row, Status* status) noexcept {
+  DCHECK(!closed_);
+  DCHECK(has_write_iterator());
+  // DeepCopy() is only attempted when a write page exists (short-circuit evaluation).
+  const bool copied_in_place =
+      write_page_ != nullptr && DeepCopy(row, &write_ptr_, write_end_ptr_);
+  if (UNLIKELY(!copied_in_place)) return AddRowSlow(row, status);
+
+  ++num_rows_;
+  ++write_page_->num_rows;
+  return true;
+}
+
+/// Flattens 'row' into the buffer starting at '*data' and bounded by 'data_end' by
+/// calling the DeepCopyInternal() instantiation that matches 'has_nullable_tuple_'.
+/// Returns that call's result.
+bool BufferedTupleStream::DeepCopy(
+    TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept {
+  if (has_nullable_tuple_) {
+    return DeepCopyInternal<true>(row, data, data_end);
+  }
+  return DeepCopyInternal<false>(row, data, data_end);
+}
+
+// TODO: consider codegening this.
+// TODO: in case of duplicate tuples, this can redundantly serialize data.
+/// Flattens 'row' into the buffer at '*data': first the null indicator bytes (only if
+/// HAS_NULLABLE_TUPLE), then the fixed-length bytes of each non-null tuple, then the
+/// var-len data of the inlined string slots followed by that of the inlined collection
+/// slots.
+/// Returns false, leaving '*data' unchanged, if the row does not fit before 'data_end'.
+/// Bytes may already have been written past '*data' in that case. On success '*data'
+/// is advanced to the first byte after the flattened row.
+template <bool HAS_NULLABLE_TUPLE>
+bool BufferedTupleStream::DeepCopyInternal(
+    TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept {
+  uint8_t* pos = *data;
+  const uint64_t tuples_per_row = desc_->tuple_descriptors().size();
+  // Copy the not NULL fixed len tuples. For the NULL tuples just update the NULL tuple
+  // indicator.
+  if (HAS_NULLABLE_TUPLE) {
+    const int null_indicator_bytes = NullIndicatorBytesPerRow();
+    if (UNLIKELY(pos + null_indicator_bytes > data_end)) return false;
+    uint8_t* null_indicators = pos;
+    pos += null_indicator_bytes;
+    memset(null_indicators, 0, null_indicator_bytes);
+    for (int i = 0; i < tuples_per_row; ++i) {
+      // Tuple i maps to bit (7 - i % 8) of byte i / 8, i.e. most-significant bit first.
+      uint8_t* null_word = null_indicators + (i >> 3);
+      const uint32_t null_pos = i & 7;
+      const int tuple_size = fixed_tuple_sizes_[i];
+      Tuple* t = row->GetTuple(i);
+      const uint8_t mask = 1 << (7 - null_pos);
+      if (t != nullptr) {
+        if (UNLIKELY(pos + tuple_size > data_end)) return false;
+        memcpy(pos, t, tuple_size);
+        pos += tuple_size;
+      } else {
+        *null_word |= mask;
+      }
+    }
+  } else {
+    // If we know that there are no nullable tuples no need to set the nullability flags.
+    for (int i = 0; i < tuples_per_row; ++i) {
+      const int tuple_size = fixed_tuple_sizes_[i];
+      if (UNLIKELY(pos + tuple_size > data_end)) return false;
+      Tuple* t = row->GetTuple(i);
+      // TODO: Once IMPALA-1306 (Avoid passing empty tuples of non-materialized slots)
+      // is delivered, the check below should become DCHECK(t != nullptr).
+      DCHECK(t != nullptr || tuple_size == 0);
+      // A null 't' is permitted for zero-size tuples, but passing a null pointer to
+      // memcpy() is undefined behaviour even when the length is zero, so skip the call.
+      if (LIKELY(tuple_size > 0)) {
+        memcpy(pos, t, tuple_size);
+        pos += tuple_size;
+      }
+    }
+  }
+
+  // Copy inlined string slots. Note: we do not need to convert the string ptrs to offsets
+  // on the write path, only on the read. The tuple data is immediately followed
+  // by the string data so only the len information is necessary.
+  for (int i = 0; i < inlined_string_slots_.size(); ++i) {
+    const Tuple* tuple = row->GetTuple(inlined_string_slots_[i].first);
+    if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue;
+    if (UNLIKELY(!CopyStrings(tuple, inlined_string_slots_[i].second, &pos, data_end)))
+      return false;
+  }
+
+  // Copy inlined collection slots. We copy collection data in a well-defined order so
+  // we do not need to convert pointers to offsets on the write path.
+  for (int i = 0; i < inlined_coll_slots_.size(); ++i) {
+    const Tuple* tuple = row->GetTuple(inlined_coll_slots_[i].first);
+    if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue;
+    if (UNLIKELY(!CopyCollections(tuple, inlined_coll_slots_[i].second, &pos, data_end)))
+      return false;
+  }
+  // Only publish the new position once the whole row has been written.
+  *data = pos;
+  return true;
+}
+
+/// Appends the payload of every non-null, non-empty string slot of 'tuple' listed in
+/// 'string_slots' to the buffer at '*data', advancing '*data' past each copied string.
+/// Returns false as soon as a string would extend beyond 'data_end'; strings copied
+/// before that point remain in the buffer and '*data' stays advanced past them.
+bool BufferedTupleStream::CopyStrings(const Tuple* tuple,
+    const vector<SlotDescriptor*>& string_slots, uint8_t** data, const uint8_t* data_end) {
+  for (auto slot_it = string_slots.begin(); slot_it != string_slots.end(); ++slot_it) {
+    const SlotDescriptor* slot = *slot_it;
+    if (tuple->IsNull(slot->null_indicator_offset())) continue;
+    const StringValue* str = tuple->GetStringSlot(slot->tuple_offset());
+    // Strings with a non-positive length contribute no bytes.
+    if (UNLIKELY(!(str->len > 0))) continue;
+    if (UNLIKELY(*data + str->len > data_end)) return false;
+
+    memcpy(*data, str->ptr, str->len);
+    *data += str->len;
+  }
+  return true;
+}
+
+/// Appends the item tuples of every non-null, non-empty collection slot of 'tuple'
+/// listed in 'collection_slots' to the buffer at '*data'. If the item tuples contain
+/// var-len slots, the strings and nested collections of each item are appended
+/// afterwards, item by item. '*data' is advanced past everything copied.
+/// Returns false as soon as the data would extend beyond 'data_end'.
+bool BufferedTupleStream::CopyCollections(const Tuple* tuple,
+    const vector<SlotDescriptor*>& collection_slots, uint8_t** data, const uint8_t* data_end) {
+  for (const SlotDescriptor* slot_desc : collection_slots) {
+    if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
+    const CollectionValue* cv = tuple->GetCollectionSlot(slot_desc->tuple_offset());
+    const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor();
+    if (LIKELY(cv->num_tuples > 0)) {
+      // Compute the size in 64-bit arithmetic: the product of the item count and the
+      // item size can exceed the range of 'int' for large collections, which would
+      // defeat the bounds check below.
+      const int64_t item_byte_size = item_desc.byte_size();
+      const int64_t coll_byte_size = cv->num_tuples * item_byte_size;
+      // Compare against the remaining space instead of forming a pointer that may lie
+      // beyond the end of the buffer.
+      if (UNLIKELY(data_end - *data < coll_byte_size)) return false;
+      uint8_t* coll_data = *data;
+      memcpy(coll_data, cv->ptr, coll_byte_size);
+      *data += coll_byte_size;
+
+      if (!item_desc.HasVarlenSlots()) continue;
+      // Copy variable length data when present in collection items.
+      for (int i = 0; i < cv->num_tuples; ++i) {
+        const Tuple* item = reinterpret_cast<Tuple*>(coll_data);
+        if (UNLIKELY(!CopyStrings(item, item_desc.string_slots(), data, data_end))) {
+          return false;
+        }
+        if (UNLIKELY(
+                !CopyCollections(item, item_desc.collection_slots(), data, data_end))) {
+          return false;
+        }
+        coll_data += item_byte_size;
+      }
+    }
+  }
+  return true;
+}
+
+/// Fills 'row' with tuple pointers into the flattened row that starts at 'flat_row'.
+/// Requires a pinned, open stream that is not being read in delete-on-read mode.
+void BufferedTupleStream::GetTupleRow(FlatRowPtr flat_row, TupleRow* row) const {
+  DCHECK(row != nullptr);
+  DCHECK(!closed_);
+  DCHECK(is_pinned());
+  DCHECK(!delete_on_read_);
+  // UnflattenTupleRow() advances the pointer it is given, so work on a local copy.
+  uint8_t* cursor = flat_row;
+  if (has_nullable_tuple_) {
+    UnflattenTupleRow<true>(&cursor, row);
+  } else {
+    UnflattenTupleRow<false>(&cursor, row);
+  }
+}
+
+/// Sets the tuple pointers of 'row' to the fixed-length tuples of the flattened row
+/// starting at '*data', then advances '*data' past the null indicator bytes (if any)
+/// and the fixed-length tuple data. Var-len data is not touched.
+/// HAS_NULLABLE_TUPLE must equal 'has_nullable_tuple_'; it selects the row layout at
+/// compile time so that the per-row branch can be eliminated in each instantiation.
+template <bool HAS_NULLABLE_TUPLE>
+void BufferedTupleStream::UnflattenTupleRow(uint8_t** data, TupleRow* row) const {
+  DCHECK(HAS_NULLABLE_TUPLE == has_nullable_tuple_);
+  const int tuples_per_row = desc_->tuple_descriptors().size();
+  uint8_t* ptr = *data;
+  if (HAS_NULLABLE_TUPLE) {
+    // Stitch together the tuples from the page and the NULL ones.
+    const uint8_t* null_indicators = ptr;
+    ptr += NullIndicatorBytesPerRow();
+    for (int i = 0; i < tuples_per_row; ++i) {
+      // Tuple i maps to bit (7 - i % 8) of byte i / 8; a set bit means the tuple is null.
+      const uint8_t* null_word = null_indicators + (i >> 3);
+      const uint32_t null_pos = i & 7;
+      const bool is_not_null = ((*null_word & (1 << (7 - null_pos))) == 0);
+      // Branch-free: multiplying by 'is_not_null' yields a null pointer and a zero
+      // advance for null tuples.
+      row->SetTuple(
+          i, reinterpret_cast<Tuple*>(reinterpret_cast<uint64_t>(ptr) * is_not_null));
+      ptr += fixed_tuple_sizes_[i] * is_not_null;
+    }
+  } else {
+    for (int i = 0; i < tuples_per_row; ++i) {
+      row->SetTuple(i, reinterpret_cast<Tuple*>(ptr));
+      ptr += fixed_tuple_sizes_[i];
+    }
+  }
+  *data = ptr;
+}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/runtime/buffered-tuple-stream.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream.h b/be/src/runtime/buffered-tuple-stream.h
new file mode 100644
index 0000000..dbf3faf
--- /dev/null
+++ b/be/src/runtime/buffered-tuple-stream.h
@@ -0,0 +1,705 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_RUNTIME_BUFFERED_TUPLE_STREAM_H
+#define IMPALA_RUNTIME_BUFFERED_TUPLE_STREAM_H
+
+#include <set>
+#include <vector>
+#include <boost/scoped_ptr.hpp>
+#include <boost/function.hpp>
+
+#include "common/global-types.h"
+#include "common/status.h"
+#include "gutil/macros.h"
+#include "runtime/bufferpool/buffer-pool.h"
+#include "runtime/row-batch.h"
+
+namespace impala {
+
+class MemTracker;
+class RuntimeState;
+class RowDescriptor;
+class SlotDescriptor;
+class Tuple;
+class TupleRow;
+
+/// Class that provides an abstraction for a stream of tuple rows backed by BufferPool
+/// Pages. Rows can be added to the stream and read back. Rows are returned in the order
+/// they are added.
+///
+/// The BufferedTupleStream is *not* thread safe from the caller's point of view.
+/// Different threads should not concurrently call methods of the same BufferedTupleStream
+/// object.
+///
+/// Reading and writing the stream:
+/// The stream supports two modes of reading/writing, depending on whether
+/// PrepareForWrite() is called to initialize a write iterator only or
+/// PrepareForReadWrite() is called to initialize both read and write iterators to enable
+/// interleaved reads and writes.
+///
+/// To use write-only mode, PrepareForWrite() is called once and AddRow()/AddRowCustom*()
+/// are called repeatedly to initialize then advance a write iterator through the stream.
+/// Once the stream is fully written, it can be read back by calling PrepareForRead()
+/// then GetNext() repeatedly to advance a read iterator through the stream, or by
+/// calling GetRows() to get all of the rows at once.
+///
+/// To use read/write mode, PrepareForReadWrite() is called once to initialize the read
+/// and write iterators. AddRow()/AddRowCustom*() then advance a write iterator through
+/// the stream, and GetNext() advances a trailing read iterator through the stream.
+///
+/// Buffer management:
+/// The tuple stream is backed by a sequence of BufferPool Pages. The tuple stream uses
+/// the client's reservation to pin pages in memory. It will automatically try to
+/// increase the client's reservation whenever it needs to do so to make progress.
+///
+/// Normally pages are all of the same default page length, but larger pages up to the
+/// max page length are used if needed to store rows that are too large for a
+/// default-length page.
+///
+/// The stream has both pinned and unpinned modes. In the pinned mode all pages are
+/// pinned for reading. The pinned mode avoids I/O by keeping all pages pinned in memory
+/// and allows clients to save pointers to rows in the stream and randomly access them.
+/// E.g. hash tables can be backed by a BufferedTupleStream. In the unpinned mode, only
+/// pages currently being read and written are pinned and other pages are unpinned and
+/// therefore do not use the client's reservation and can be spilled to disk. The stream
+/// always holds onto a default page's worth of reservation for the read and write
+/// iterators (i.e. two pages' worth if the stream is in read/write mode), even if that
+/// many pages are not currently pinned. This means that UnpinStream() always succeeds,
+/// and moving to the next default-length write page or read page on an unpinned stream
+/// does not require additional reservation. This is implemented by saving reservations
+/// in SubReservations.
+///
+/// To read or write a row larger than the default page size to/from an unpinned stream,
+/// the client must have max_page_len - default_page_len unused reservation. Writing a
+/// large row to an unpinned stream only uses the reservation for the duration of the
+/// AddRow()/AddRowCustom*() call. Reading a large row from an unpinned stream uses the
+/// reservation until the next call to GetNext(). E.g. to partition a single unpinned
+/// stream into n unpinned streams, the reservation needed is (n - 1) *
+/// default_page_len + 2 * max_page_len: one large read buffer and one large write
+/// buffer is needed to keep the row being processed in-memory, but only default-sized
+/// buffers are needed for the other streams being written.
+///
+/// The tuple stream also supports a 'delete_on_read' mode, enabled by passing a flag
+/// to PrepareForRead() which deletes the stream's pages as it does a final read
+/// pass over the stream.
+///
+/// TODO: IMPALA-4179: the buffer management can be simplified once we can attach
+/// buffers to RowBatches.
+///
+/// Page layout:
+/// Rows are stored back to back starting at the first byte of each page's buffer, with
+/// no interleaving of data from different rows. There is no padding or alignment
+/// between rows. Rows larger than the default page length are stored on their own
+/// page.
+///
+/// Tuple row layout:
+/// If the stream's tuples are nullable (i.e. has_nullable_tuple_ is true), there is a
+/// bitstring at the start of each row with null indicators for all tuples in each row
+/// (including non-nullable tuples). The bitstring occupies ceil(num_tuples_per_row / 8)
+/// bytes. A 1 indicates the tuple is null.
+///
+/// The fixed length parts of the row's tuples are stored first, followed by var len data
+/// for inlined_string_slots_ and inlined_coll_slots_. Other "external" var len slots can
+/// point to var len data outside the stream. When reading the stream, the length of each
+/// row's var len data in the stream must be computed to find the next row's start.
+///
+/// The tuple stream supports reading from the stream into RowBatches without copying
+/// out any data: the RowBatches' Tuple pointers will point directly into the stream's
+/// pages' buffers. The fixed length parts follow Impala's internal tuple format, so for
+/// the tuple to be valid, we only need to update pointers to point to the var len data
+/// in the stream. These pointers need to be updated by the stream because a spilled
+/// page's data may be relocated to a different buffer. The pointers are updated lazily
+/// upon reading the stream via GetNext() or GetRows().
+///
+/// Example layout for a row with two non-nullable tuples ((1, "hello"), (2, "world"))
+/// with all var len data stored in the stream:
+///  <---- tuple 1 -----> <------ tuple 2 ------> <- var len -> <- next row ...
+/// +--------+-----------+-----------+-----------+-------------+
+/// | IntVal | StringVal | BigIntVal | StringVal |             | ...
+/// +--------+-----------+-----------+-----------+-------------+
+/// | val: 1 | len: 5    | val: 2    | len: 5    | helloworld  | ...
+/// |        | ptr: 0x.. |           | ptr: 0x.. |             | ...
+/// +--------+-----------+-----------+-----------+-------------+
+///  <--4b--> <---12b---> <----8b---> <---12b---> <----10b---->
+///
+/// Example layout for a row with the second tuple nullable ((1, "hello"), NULL)
+/// with all var len data stored in the stream:
+/// <- null tuple bitstring -> <---- tuple 1 -----> <- var len -> <- next row ...
+/// +-------------------------+--------+-----------+------------+
+/// |                         | IntVal | StringVal |            | ...
+/// +-------------------------+--------+-----------+------------+
+/// | 0100 0000               | val: 1 | len: 5    | hello      | ...
+/// |                         |        | ptr: 0x.. |            | ...
+/// +-------------------------+--------+-----------+------------+
+///  <---------1b------------> <--4b--> <---12b---> <----5b---->
+///
+/// Example layout for a row with a single non-nullable tuple (("hello", "world")) with
+/// the second string slot stored externally to the stream:
+///  <------ tuple 1 ------> <- var len ->  <- next row ...
+/// +-----------+-----------+-------------+
+/// | StringVal | StringVal |             | ...
+/// +-----------+-----------+-------------+
+/// | len: 5    | len: 5    |  hello      | ...
+/// | ptr: 0x.. | ptr: 0x.. |             | ...
+/// +-----------+-----------+-------------+
+///  <---12b---> <---12b---> <-----5b---->
+///
+/// The behavior of reads and writes is as follows:
+/// Read:
+///   1. Unpinned: Only a single read page is pinned at a time. This means that only
+///     enough reservation to pin a single page is needed to read the stream, regardless
+///     of the stream's size. Each page is deleted or unpinned (if delete on read is true
+///     or false respectively) before advancing to the next page.
+///   2. Pinned: All pages in the stream are pinned so do not need to be pinned or
+///     unpinned when reading from the stream. If delete on read is true, pages are
+///     deleted after being read. If the stream was previously unpinned, the page's data
+///     may not yet be in memory - reading from the stream can block on I/O or fail with
+///     an I/O error.
+/// Write:
+///   1. Unpinned: Unpin pages as they fill up. This means that only enough reservation
+///     to pin a single write page is required to write to the stream, regardless of the
+///     stream's size.
+///   2. Pinned: Pages are left pinned. If the next page in the stream cannot be pinned
+///     because the client's reservation is insufficient (and could not be increased by
+///     the stream), the write call will fail and the client can either unpin the stream
+///     or free up other memory before retrying.
+///
+/// Memory lifetime of rows read from stream:
+/// If the stream is pinned and delete on read is false, it is valid to access any tuples
+/// returned via GetNext() or GetRows() until the stream is unpinned. If the stream is
+/// unpinned or delete on read is true, then the batch returned from GetNext() may have
+/// the needs_deep_copy flag set, which means that any tuple memory returned so far from
+/// the stream may be freed on the next call to GetNext().
+/// TODO: IMPALA-4179, instead of needs_deep_copy, attach the pages' buffers to the batch.
+///
+/// Manual construction of rows with AddRowCustomBegin()/AddRowCustomEnd():
+/// The BufferedTupleStream supports allocation of uninitialized rows with
+/// AddRowCustom*(). AddRowCustomBegin() is called instead of AddRow() if the client wants
+/// to manually construct a row. The caller of AddRowCustomBegin() is responsible for
+/// writing the row with exactly the layout described above then calling
+/// AddRowCustomEnd() when done.
+///
+/// If a caller constructs a tuple in this way, the caller can set the pointers and they
+/// will not be modified until the stream is read via GetNext() or GetRows().
+/// TODO: IMPALA-5007: try to remove AddRowCustom*() by unifying with AddRow().
+///
+/// TODO: we need to be able to do read ahead for pages. We need some way to indicate a
+/// page will need to be pinned soon.
+class BufferedTupleStream {
+ public:
+  /// A pointer to the start of a flattened TupleRow in the stream.
+  typedef uint8_t* FlatRowPtr;
+
+  /// row_desc: description of rows stored in the stream. This is the desc for rows
+  /// that are added and the rows being returned.
+  /// page_len: the size of pages to use in the stream
+  /// ext_varlen_slots: set of varlen slots with data stored externally to the stream
+  BufferedTupleStream(RuntimeState* state, const RowDescriptor* row_desc,
+      BufferPool::ClientHandle* buffer_pool_client, int64_t default_page_len,
+      int64_t max_page_len,
+      const std::set<SlotId>& ext_varlen_slots = std::set<SlotId>());
+
+  virtual ~BufferedTupleStream();
+
+  /// Initializes the tuple stream object on behalf of node 'node_id'. Must be called
+  /// once before any of the other APIs.
+  /// If 'pinned' is true, the tuple stream starts off pinned, otherwise it is unpinned.
+  /// 'node_id' is only used for error reporting.
+  Status Init(int node_id, bool pinned) WARN_UNUSED_RESULT;
+
+  /// Prepares the stream for writing by saving enough reservation for a default-size
+  /// write page. Tries to increase reservation if there is not enough unused reservation
+  /// for a page. Called after Init() and before the first AddRow() or
+  /// AddRowCustomBegin() call.
+  /// 'got_reservation': set to true if there was enough reservation to initialize the
+  ///     first write page and false if there was not enough reservation and no other
+  ///     error was encountered. Undefined if an error status is returned.
+  Status PrepareForWrite(bool* got_reservation) WARN_UNUSED_RESULT;
+
+  /// Prepares the stream for interleaved reads and writes by saving enough reservation
+  /// for default-sized read and write pages. Called after Init() and before the first
+  /// AddRow() or AddRowCustomBegin() call.
+  /// 'delete_on_read': Pages are deleted after they are read.
+  /// 'got_reservation': set to true if there was enough reservation to initialize the
+  ///     read and write pages and false if there was not enough reservation and no other
+  ///     error was encountered. Undefined if an error status is returned.
+  Status PrepareForReadWrite(
+      bool delete_on_read, bool* got_reservation) WARN_UNUSED_RESULT;
+
+  /// Prepares the stream for reading, invalidating the write iterator (if there is one).
+  /// Therefore must be called after the last AddRow() or AddRowCustomEnd() and before
+  /// GetNext(). PrepareForRead() can be called multiple times to do multiple read passes
+  /// over the stream, unless rows were read from the stream after PrepareForRead() or
+  /// PrepareForReadWrite() was called with delete_on_read = true.
+  /// 'delete_on_read': Pages are deleted after they are read.
+  /// 'got_reservation': set to true if there was enough reservation to initialize the
+  ///     first read page and false if there was not enough reservation and no other
+  ///     error was encountered. Undefined if an error status is returned.
+  Status PrepareForRead(bool delete_on_read, bool* got_reservation) WARN_UNUSED_RESULT;
+
+  /// Adds a single row to the stream. There are four possible outcomes:
+  /// a) The append succeeds. True is returned.
+  /// b) The append fails because the unused reservation was not sufficient to add
+  ///   a new page to the stream large enough to fit 'row' and the stream could not
+  ///   increase the reservation to get enough unused reservation. Returns false and
+  ///   sets 'status' to OK. The append can be retried after freeing up memory or
+  ///   unpinning the stream.
+  /// c) The append fails with a runtime error. Returns false and sets 'status' to an
+  ///   error.
+  /// d) The append fails because the row is too large to fit in a page of a stream.
+  ///   Returns false and sets 'status' to an error.
+  ///
+  /// Unpinned streams can only encounter case b) when appending a row larger than
+  /// the default page size and the reservation could not be increased sufficiently.
+  /// Otherwise enough memory is automatically freed up by unpinning the current write
+  /// page.
+  ///
+  /// BufferedTupleStream will do a deep copy of the memory in the row. After AddRow()
+  /// returns an error, it should not be called again.
+  bool AddRow(TupleRow* row, Status* status) noexcept WARN_UNUSED_RESULT;
+
+  /// Allocates space to store a row of 'size' bytes (including fixed and variable length
+  /// data). If successful, returns a pointer to the allocated row. The caller then must
+  /// write valid data to the row and call AddRowCustomEnd().
+  ///
+  /// If unsuccessful, returns nullptr. The failure modes are the same as described in the
+  /// AddRow() comment.
+  ALWAYS_INLINE uint8_t* AddRowCustomBegin(int64_t size, Status* status);
+
+  /// Called after AddRowCustomBegin() when done writing the row. Only should be called
+  /// if AddRowCustomBegin() succeeded. See the AddRowCustomBegin() comment for
+  /// explanation.
+  /// 'size': the size passed into AddRowCustomBegin().
+  void AddRowCustomEnd(int64_t size);
+
+  /// Unflattens 'flat_row' into a regular TupleRow 'row'. Only valid to call if the
+  /// stream is pinned. The row must have been allocated with the stream's row desc.
+  /// The returned 'row' is backed by memory from the stream so is only valid as long
+  /// as the stream is pinned.
+  void GetTupleRow(FlatRowPtr flat_row, TupleRow* row) const;
+
+  /// Pins all pages in this stream and switches to pinned mode. Has no effect if the
+  /// stream is already pinned.
+  /// If the current unused reservation is not sufficient to pin the stream in memory,
+  /// this will try to increase the reservation. If that fails, 'pinned' is set to false
+  /// and the stream is left unpinned. Otherwise 'pinned' is set to true.
+  Status PinStream(bool* pinned) WARN_UNUSED_RESULT;
+
+  /// Modes for UnpinStream().
+  enum UnpinMode {
+    /// All pages in the stream are unpinned and the read/write positions in the stream
+    /// are reset. No more rows can be written to the stream after this. The stream can
+    /// be re-read from the beginning by calling PrepareForRead().
+    UNPIN_ALL,
+    /// All pages are unpinned aside from the current read and write pages (if any),
+    /// which are left in the same state. The unpinned stream can continue being read
+    /// or written from the current read or write positions.
+    UNPIN_ALL_EXCEPT_CURRENT,
+  };
+
+  /// Unpins stream with the given 'mode' as described above.
+  void UnpinStream(UnpinMode mode);
+
+  /// Get the next batch of output rows, which are backed by the stream's memory.
+  /// If the stream is unpinned or 'delete_on_read' is true, the 'needs_deep_copy'
+  /// flag may be set on 'batch' to signal that memory will be freed on the next
+  /// call to GetNext() and that the caller should copy out any data it needs from
+  /// rows in 'batch' or in previous batches returned from GetNext().
+  ///
+  /// If the stream is pinned and 'delete_on_read' is false, the memory backing the
+  /// rows will remain valid until the stream is unpinned, destroyed, etc.
+  /// TODO: IMPALA-4179: update when we simplify the memory transfer model.
+  Status GetNext(RowBatch* batch, bool* eos) WARN_UNUSED_RESULT;
+
+  /// Same as above, but populate 'flat_rows' with a pointer to the flat version of
+  /// each returned row in the pinned stream. The pointers in 'flat_rows' are only
+  /// valid as long as the stream remains pinned.
+  Status GetNext(
+      RowBatch* batch, bool* eos, std::vector<FlatRowPtr>* flat_rows) WARN_UNUSED_RESULT;
+
+  /// Returns all the rows in the stream in 'batch'. This pins the entire stream in the
+  /// process. If the current unused reservation is not sufficient to pin the stream in
+  /// memory, this will try to increase the reservation. If that fails, 'got_rows' is set
+  /// to false.
+  Status GetRows(MemTracker* tracker, boost::scoped_ptr<RowBatch>* batch,
+      bool* got_rows) WARN_UNUSED_RESULT;
+
+  /// Must be called once at the end to cleanup all resources. If 'batch' is non-NULL,
+  /// attaches buffers from pinned pages that rows returned from GetNext() may reference.
+  /// Otherwise deletes all pages. Does nothing if the stream was already closed. The
+  /// 'flush' mode is forwarded to RowBatch::AddBuffer() when attaching buffers.
+  void Close(RowBatch* batch, RowBatch::FlushMode flush);
+
+  /// Number of rows in the stream.
+  int64_t num_rows() const { return num_rows_; }
+
+  /// Number of rows returned via GetNext().
+  int64_t rows_returned() const { return rows_returned_; }
+
+  /// Returns the byte size necessary to store the entire stream in memory.
+  int64_t byte_size() const { return total_byte_size_; }
+
+  /// Returns the number of bytes currently pinned in memory by the stream.
+  /// If ignore_current is true, the write_page_ memory is not included.
+  int64_t BytesPinned(bool ignore_current) const {
+    if (ignore_current && write_page_ != nullptr && write_page_->is_pinned()) {
+      return bytes_pinned_ - write_page_->len();
+    }
+    return bytes_pinned_;
+  }
+
+  bool is_closed() const { return closed_; }
+  bool is_pinned() const { return pinned_; }
+  bool has_read_iterator() const { return has_read_iterator_; }
+  bool has_write_iterator() const { return has_write_iterator_; }
+
+  std::string DebugString() const;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(BufferedTupleStream);
+  friend class ArrayTupleStreamTest_TestArrayDeepCopy_Test;
+  friend class ArrayTupleStreamTest_TestComputeRowSize_Test;
+  friend class MultiNullableTupleStreamTest_TestComputeRowSize_Test;
+  friend class SimpleTupleStreamTest_TestGetRowsOverflow_Test;
+
+  /// Wrapper around BufferPool::PageHandle that tracks additional info about the page.
+  struct Page {
+    Page() : num_rows(0), retrieved_buffer(true) {}
+
+    inline int len() const { return handle.len(); }
+    inline bool is_pinned() const { return handle.is_pinned(); }
+    inline int pin_count() const { return handle.pin_count(); }
+    Status GetBuffer(const BufferPool::BufferHandle** buffer) {
+      RETURN_IF_ERROR(handle.GetBuffer(buffer));
+      retrieved_buffer = true;
+      return Status::OK();
+    }
+    std::string DebugString() const;
+
+    BufferPool::PageHandle handle;
+
+    /// Number of rows written to the page.
+    int num_rows;
+
+    /// Whether we called GetBuffer() on the page since it was last pinned. This means
+    /// that GetBuffer() and ExtractBuffer() cannot fail and that GetNext() may have
+    /// returned rows referencing the page's buffer.
+    bool retrieved_buffer;
+  };
+
+  /// Runtime state instance used to check for cancellation. Not owned.
+  RuntimeState* const state_;
+
+  /// Description of rows stored in the stream.
+  const RowDescriptor* desc_;
+
+  /// Plan node ID, used for error reporting.
+  int node_id_;
+
+  /// The size of the fixed length portion for each tuple in the row.
+  std::vector<int> fixed_tuple_sizes_;
+
+  /// Vectors of all the string slots that have their varlen data stored in the stream,
+  /// grouped by tuple_idx.
+  std::vector<std::pair<int, std::vector<SlotDescriptor*>>> inlined_string_slots_;
+
+  /// Vectors of all the collection slots that have their varlen data stored in the
+  /// stream, grouped by tuple_idx.
+  std::vector<std::pair<int, std::vector<SlotDescriptor*>>> inlined_coll_slots_;
+
+  /// Buffer pool and client used to allocate, pin and release pages. Not owned.
+  BufferPool* buffer_pool_;
+  BufferPool::ClientHandle* buffer_pool_client_;
+
+  /// List of pages in the stream.
+  /// Empty iff one of two cases applies:
+  /// * before the first row has been added with AddRow() or AddRowCustomBegin().
+  /// * after the stream has been destructively read in 'delete_on_read' mode
+  std::list<Page> pages_;
+  // IMPALA-5629: avoid O(n) list.size() call by explicitly tracking the number of pages.
+  // TODO: remove when we switch to GCC5+, where list.size() is O(1). See GCC bug #49561.
+  int64_t num_pages_;
+
+  /// Total size of pages_, including any pages already deleted in 'delete_on_read'
+  /// mode.
+  int64_t total_byte_size_;
+
+  /// True if there is currently an active read iterator for the stream.
+  bool has_read_iterator_;
+
+  /// The current page being read. When no read iterator is active, equal to pages_.end().
+  /// When a read iterator is active, either points to the current read page, or equals
+  /// pages_.end() if no rows have yet been read. GetNext() does not advance this past
+  /// the end of the stream, so upon eos 'read_page_' points to the last page and
+  /// rows_returned_ == num_rows_. Always pinned, unless a Pin() call failed and an error
+  /// status was returned.
+  std::list<Page>::iterator read_page_;
+
+  /// Saved reservation for read iterator. 'default_page_len_' reservation is saved if
+  /// there is a read iterator, no pinned read page, and the possibility that the read
+  /// iterator will advance to a valid page.
+  BufferPool::SubReservation read_page_reservation_;
+
+  /// Number of rows returned from the current read_page_.
+  uint32_t read_page_rows_returned_;
+
+  /// Pointer into read_page_ to the byte after the last row read.
+  uint8_t* read_ptr_;
+
+  /// Pointer to one byte past the end of read_page_. Used to detect overruns.
+  const uint8_t* read_end_ptr_;
+
+  /// Pointer into write_page_ to the byte after the last row written.
+  uint8_t* write_ptr_;
+
+  /// Pointer to one byte past the end of write_page_. Cached to speed up computation
+  const uint8_t* write_end_ptr_;
+
+  /// Number of rows returned to the caller from GetNext() since the last
+  /// PrepareForRead() call.
+  int64_t rows_returned_;
+
+  /// True if there is currently an active write iterator into the stream.
+  bool has_write_iterator_;
+
+  /// The current page for writing. NULL if there is no write iterator or no current
+  /// write page. Always pinned. Size is 'default_page_len_', except temporarily while
+  /// appending a larger row between AddRowCustomBegin() and AddRowCustomEnd().
+  Page* write_page_;
+
+  /// Saved reservation for write iterator. 'default_page_len_' reservation is saved if
+  /// there is a write iterator, no page currently pinned for writing and the possibility
+  /// that a pin count will be needed for the write iterator in future. Specifically if:
+  /// * no rows have been appended to the stream and 'pages_' is empty, or
+  /// * the stream is unpinned, 'write_page_' is null and the last page in 'pages_'
+  ///   is a large page that we advanced past, or
+  /// * there is only one pinned page in the stream and it is already pinned for reading.
+  BufferPool::SubReservation write_page_reservation_;
+
+  /// Total bytes of pinned pages in pages_, stored to avoid iterating over the list
+  /// to compute it.
+  int64_t bytes_pinned_;
+
+  /// Number of rows stored in the stream. Includes rows that were already deleted during
+  /// a destructive 'delete_on_read' pass over the stream.
+  int64_t num_rows_;
+
+  /// The default length in bytes of pages used to store the stream's rows. All rows that
+  /// fit in a default-sized page are stored in a default-sized page.
+  const int64_t default_page_len_;
+
+  /// The maximum length in bytes of pages used to store the stream's rows. This is a
+  /// hard limit on the maximum size of row that can be stored in the stream and the
+  /// amount of reservation required to read or write to an unpinned stream.
+  const int64_t max_page_len_;
+
+  /// Whether any tuple in the rows is nullable.
+  const bool has_nullable_tuple_;
+
+  /// If true, pages are deleted after they are read during this read pass. Once rows
+  /// have been read from a stream with 'delete_on_read_' true, this is always true.
+  bool delete_on_read_;
+
+  bool closed_; // Used for debugging.
+
+  /// If true, this stream has been explicitly pinned by the caller and all pages are
+  /// kept pinned until the caller calls UnpinStream().
+  bool pinned_;
+
+  bool is_read_page(const Page* page) const {
+    return read_page_ != pages_.end() && &*read_page_ == page;
+  }
+
+  bool is_write_page(const Page* page) const { return write_page_ == page; }
+
+  /// Return true if the read and write page are the same.
+  bool has_read_write_page() const {
+    return write_page_ != nullptr && is_read_page(write_page_);
+  }
+
+  /// The slow path for AddRow() that is called if there is not sufficient space in
+  /// the current page.
+  bool AddRowSlow(TupleRow* row, Status* status) noexcept;
+
+  /// The slow path for AddRowCustomBegin() that is called if there is not sufficient space in
+  /// the current page.
+  uint8_t* AddRowCustomBeginSlow(int64_t size, Status* status) noexcept;
+
+  /// The slow path for AddRowCustomEnd() that is called for large pages.
+  void AddLargeRowCustomEnd(int64_t size) noexcept;
+
+  /// Copies 'row' into the buffer starting at *data and ending at the byte before
+  /// 'data_end'. On success, returns true and updates *data to point after the last
+  /// byte written. Returns false if there is not enough space in the buffer provided.
+  bool DeepCopy(TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept;
+
+  /// Templated implementation of DeepCopy().
+  template <bool HAS_NULLABLE_TUPLE>
+  bool DeepCopyInternal(TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept;
+
+  /// Helper function to copy strings in string_slots from tuple into *data.
+  /// Updates *data to the end of the string data added. Returns false if the data
+  /// does not fit in the buffer [*data, data_end).
+  static bool CopyStrings(const Tuple* tuple,
+      const std::vector<SlotDescriptor*>& string_slots, uint8_t** data,
+      const uint8_t* data_end);
+
+  /// Helper function to deep copy collections in collection_slots from tuple into
+  /// the buffer [*data, data_end). Updates *data to the end of the collection data
+  /// added. Returns false if the data does not fit in the buffer.
+  static bool CopyCollections(const Tuple* tuple,
+      const std::vector<SlotDescriptor*>& collection_slots, uint8_t** data,
+      const uint8_t* data_end);
+
+  /// Gets a new page of 'page_len' bytes from buffer_pool_, updating write_page_,
+  /// write_ptr_ and write_end_ptr_. The caller must ensure there is 'page_len' unused
+  /// reservation. The caller must reset the write page (if there is one) before calling.
+  Status NewWritePage(int64_t page_len) noexcept WARN_UNUSED_RESULT;
+
+  /// Determines what page size is needed to fit a row of 'row_size' bytes.
+  /// Returns an error if the row cannot fit in a page.
+  Status CalcPageLenForRow(int64_t row_size, int64_t* page_len);
+
+  /// Wrapper around NewWritePage() that allocates a new write page that fits a row of
+  /// 'row_size' bytes. Increases reservation if needed to allocate the next page.
+  /// Returns OK and sets 'got_reservation' to true if the write page was successfully
+  /// allocated. Returns an error if the row cannot fit in a page. Returns OK and sets
+  /// 'got_reservation' to false if the reservation could not be increased and no other
+  /// error was encountered.
+  Status AdvanceWritePage(
+      int64_t row_size, bool* got_reservation) noexcept WARN_UNUSED_RESULT;
+
+  /// Reset the write page, if there is one, and unpin pages accordingly. If there
+  /// is an active write iterator, the next row will be appended to a new page.
+  void ResetWritePage();
+
+  /// Invalidate the write iterator and release any resources associated with it. After
+  /// calling this, no more rows can be appended to the stream.
+  void InvalidateWriteIterator();
+
+  /// Same as PrepareForRead(), except the iterators are not invalidated and
+  /// the caller is assumed to have checked there is sufficient unused reservation.
+  Status PrepareForReadInternal(bool delete_on_read) WARN_UNUSED_RESULT;
+
+  /// Pins the next read page. This blocks reading from disk if necessary to bring the
+  /// page's data into memory. Updates read_page_, read_ptr_, and
+  /// read_page_rows_returned_.
+  Status NextReadPage() WARN_UNUSED_RESULT;
+
+  /// Invalidate the read iterator, and release any resources associated with the active
+  /// iterator.
+  void InvalidateReadIterator();
+
+  /// Returns the total additional bytes that this row will consume in write_page_ if
+  /// appended to the page. This includes the row's null indicators, the fixed length
+  /// part of the row and the data for inlined_string_slots_ and inlined_coll_slots_.
+  int64_t ComputeRowSize(TupleRow* row) const noexcept;
+
+  /// Pins page and updates tracking stats.
+  Status PinPage(Page* page) WARN_UNUSED_RESULT;
+
+  /// Increment the page's pin count if this page needs a higher pin count given the
+  /// current read and write iterator positions and whether the stream will be pinned
+  /// ('stream_pinned'). Assumes that no scenarios occur when the pin count needs to
+  /// be incremented multiple times. The caller is responsible for ensuring sufficient
+  /// reservation is available.
+  Status PinPageIfNeeded(Page* page, bool stream_pinned) WARN_UNUSED_RESULT;
+
+  /// Decrement the page's pin count if this page needs a lower pin count given the
+  /// current read and write iterator positions and whether the stream will be pinned
+  /// ('stream_pinned'). Assumes that no scenarios occur when the pin count needs to
+  /// be decremented multiple times.
+  void UnpinPageIfNeeded(Page* page, bool stream_pinned);
+
+  /// Return the expected pin count for 'page' in the current stream based on the current
+  /// read and write pages and whether the stream is pinned.
+  int ExpectedPinCount(bool stream_pinned, const Page* page) const;
+
+  /// Return true if the stream in its current state needs to have a reservation for
+  /// a write page stored in 'write_page_reservation_'.
+  bool NeedWriteReservation() const;
+
+  /// Same as above, except assume the stream's 'pinned_' state is 'stream_pinned'.
+  bool NeedWriteReservation(bool stream_pinned) const;
+
+  /// Same as above, except assume the stream has 'num_pages' pages and different
+  /// iterator state.
+  static bool NeedWriteReservation(bool stream_pinned, int64_t num_pages,
+      bool has_write_iterator, bool has_write_page, bool has_read_write_page);
+
+  /// Return true if the stream in its current state needs to have a reservation for
+  /// a read page stored in 'read_page_reservation_'.
+  bool NeedReadReservation() const;
+
+  /// Same as above, except assume the stream's 'pinned_' state is 'stream_pinned'.
+  bool NeedReadReservation(bool stream_pinned) const;
+
+  /// Same as above, except assume the stream has 'num_pages' pages and a different
+  /// read iterator state.
+  bool NeedReadReservation(bool stream_pinned, int64_t num_pages, bool has_read_iterator,
+      bool has_read_page) const;
+
+  /// Same as above, except assume the stream has 'num_pages' pages and a different
+  /// write iterator state.
+  static bool NeedReadReservation(bool stream_pinned, int64_t num_pages,
+      bool has_read_iterator, bool has_read_page, bool has_write_iterator,
+      bool has_write_page);
+
+  /// Templated GetNext implementations.
+  template <bool FILL_FLAT_ROWS>
+  Status GetNextInternal(RowBatch* batch, bool* eos, std::vector<FlatRowPtr>* flat_rows);
+  template <bool FILL_FLAT_ROWS, bool HAS_NULLABLE_TUPLE>
+  Status GetNextInternal(RowBatch* batch, bool* eos, std::vector<FlatRowPtr>* flat_rows);
+
+  /// Helper function to convert a flattened TupleRow stored starting at '*data' into
+  /// 'row'. *data is updated to point to the first byte past the end of the row.
+  template <bool HAS_NULLABLE_TUPLE>
+  void UnflattenTupleRow(uint8_t** data, TupleRow* row) const;
+
+  /// Helper function for GetNextInternal(). For each string slot in string_slots,
+  /// update StringValue's ptr field to point to the corresponding string data stored
+  /// inline in the stream (at the current value of read_ptr_) and advance read_ptr_ by
+  /// the StringValue's length field.
+  void FixUpStringsForRead(const vector<SlotDescriptor*>& string_slots, Tuple* tuple);
+
+  /// Helper function for GetNextInternal(). For each collection slot in collection_slots,
+  /// recursively update any pointers in the CollectionValue to point to the corresponding
+  /// var len data stored inline in the stream, advancing read_ptr_ as data is read.
+  /// Assumes that the collection was serialized to the stream in DeepCopy()'s format.
+  void FixUpCollectionsForRead(
+      const vector<SlotDescriptor*>& collection_slots, Tuple* tuple);
+
+  /// Returns the number of null indicator bytes per row. Only valid if this stream has
+  /// nullable tuples.
+  int NullIndicatorBytesPerRow() const;
+
+  /// Returns the total bytes pinned. Only called in DCHECKs to validate bytes_pinned_.
+  int64_t CalcBytesPinned() const;
+
+  /// DCHECKs if the stream is internally inconsistent. The stream should always be in
+  /// a consistent state after returning success from a public API call. The Fast version
+  /// has constant runtime and does not check all of 'pages_'. The Full version includes
+  /// O(n) checks that require iterating over the whole 'pages_' list (e.g. checking that
+  /// each page is in a valid state).
+  void CheckConsistencyFast() const;
+  void CheckConsistencyFull() const;
+  void CheckPageConsistency(const Page* page) const;
+};
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/runtime/buffered-tuple-stream.inline.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream.inline.h b/be/src/runtime/buffered-tuple-stream.inline.h
new file mode 100644
index 0000000..2e1aad7
--- /dev/null
+++ b/be/src/runtime/buffered-tuple-stream.inline.h
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_RUNTIME_BUFFERED_TUPLE_STREAM_INLINE_H
+#define IMPALA_RUNTIME_BUFFERED_TUPLE_STREAM_INLINE_H
+
+#include "runtime/buffered-tuple-stream.h"
+
+#include "runtime/descriptors.h"
+#include "runtime/tuple-row.h"
+#include "util/bit-util.h"
+
+namespace impala {
+
+inline int BufferedTupleStream::NullIndicatorBytesPerRow() const {
+  DCHECK(has_nullable_tuple_);
+  return BitUtil::RoundUpNumBytes(fixed_tuple_sizes_.size());
+}
+
+inline uint8_t* BufferedTupleStream::AddRowCustomBegin(int64_t size, Status* status) {
+  DCHECK(!closed_);
+  DCHECK(has_write_iterator());
+  if (UNLIKELY(write_page_ == nullptr || write_ptr_ + size > write_end_ptr_)) {
+    return AddRowCustomBeginSlow(size, status);
+  }
+  DCHECK(write_page_ != nullptr);
+  DCHECK(write_page_->is_pinned());
+  DCHECK_LE(write_ptr_ + size, write_end_ptr_);
+  ++num_rows_;
+  ++write_page_->num_rows;
+
+  uint8_t* data = write_ptr_;
+  write_ptr_ += size;
+  return data;
+}
+
+inline void BufferedTupleStream::AddRowCustomEnd(int64_t size) {
+  if (UNLIKELY(size > default_page_len_)) AddLargeRowCustomEnd(size);
+}
+}
+
+#endif


[06/11] incubator-impala git commit: IMPALA-4674: Part 2.5: Rename BufferedTupleStreamV2

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/runtime/buffered-tuple-stream-v2-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream-v2-test.cc b/be/src/runtime/buffered-tuple-stream-v2-test.cc
deleted file mode 100644
index 7e4cef8..0000000
--- a/be/src/runtime/buffered-tuple-stream-v2-test.cc
+++ /dev/null
@@ -1,1462 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <boost/bind.hpp>
-#include <boost/filesystem.hpp>
-#include <boost/scoped_ptr.hpp>
-
-#include <limits> // for std::numeric_limits<int>::max()
-#include <set>
-#include <string>
-
-#include "codegen/llvm-codegen.h"
-#include "gutil/gscoped_ptr.h"
-#include "runtime/buffered-tuple-stream-v2.inline.h"
-#include "runtime/query-state.h"
-#include "runtime/bufferpool/reservation-tracker.h"
-#include "runtime/collection-value-builder.h"
-#include "runtime/collection-value.h"
-#include "runtime/raw-value.h"
-#include "runtime/row-batch.h"
-#include "runtime/string-value.inline.h"
-#include "runtime/test-env.h"
-#include "runtime/tmp-file-mgr.h"
-#include "service/fe-support.h"
-#include "testutil/desc-tbl-builder.h"
-#include "testutil/gtest-util.h"
-#include "util/test-info.h"
-
-#include "gen-cpp/ImpalaInternalService_types.h"
-#include "gen-cpp/Types_types.h"
-
-#include "common/names.h"
-
-using kudu::FreeDeleter;
-using std::numeric_limits;
-
-static const int BATCH_SIZE = 250;
-// Allow arbitrarily small pages in our test buffer pool.
-static const int MIN_PAGE_LEN = 1;
-// Limit the size of the buffer pool to bound memory consumption.
-static const int64_t BUFFER_POOL_LIMIT = 1024L * 1024L * 1024L;
-
-// The page length to use for the streams.
-static const int PAGE_LEN = 2 * 1024 * 1024;
-static const uint32_t PRIME = 479001599;
-
-namespace impala {
-
-static const StringValue STRINGS[] = {
-    StringValue("ABC"), StringValue("HELLO"), StringValue("123456789"),
-    StringValue("FOOBAR"), StringValue("ONE"), StringValue("THREE"),
-    StringValue("abcdefghijklmno"), StringValue("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
-    StringValue("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"),
-};
-
-static const int NUM_STRINGS = sizeof(STRINGS) / sizeof(StringValue);
-
-class SimpleTupleStreamTest : public testing::Test {
- protected:
-  virtual void SetUp() {}
-
-  virtual void CreateDescriptors() {
-    vector<bool> nullable_tuples(1, false);
-    vector<TTupleId> tuple_ids(1, static_cast<TTupleId>(0));
-
-    DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
-    int_builder.DeclareTuple() << TYPE_INT;
-    int_desc_ =
-        pool_.Add(new RowDescriptor(*int_builder.Build(), tuple_ids, nullable_tuples));
-
-    DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
-    string_builder.DeclareTuple() << TYPE_STRING;
-    string_desc_ =
-        pool_.Add(new RowDescriptor(*string_builder.Build(), tuple_ids, nullable_tuples));
-
-    // Construct descriptors for big rows with and without nullable tuples.
-    // Each tuple contains 8 slots of TYPE_INT and a single byte for null indicator.
-    DescriptorTblBuilder big_row_builder(test_env_->exec_env()->frontend(), &pool_);
-    tuple_ids.clear();
-    nullable_tuples.clear();
-    vector<bool> non_nullable_tuples;
-    const int num_tuples = BIG_ROW_BYTES / (8 * sizeof(int) + 1);
-    for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
-      big_row_builder.DeclareTuple() << TYPE_INT << TYPE_INT << TYPE_INT << TYPE_INT
-                                     << TYPE_INT << TYPE_INT << TYPE_INT << TYPE_INT;
-      tuple_ids.push_back(static_cast<TTupleId>(tuple_idx));
-      nullable_tuples.push_back(true);
-      non_nullable_tuples.push_back(false);
-    }
-    big_row_desc_ = pool_.Add(
-        new RowDescriptor(*big_row_builder.Build(), tuple_ids, non_nullable_tuples));
-    ASSERT_FALSE(big_row_desc_->IsAnyTupleNullable());
-    nullable_big_row_desc_ = pool_.Add(
-        new RowDescriptor(*big_row_builder.Build(), tuple_ids, nullable_tuples));
-  }
-
-  virtual void TearDown() {
-    if (client_.is_registered()) {
-      test_env_->exec_env()->buffer_pool()->DeregisterClient(&client_);
-    }
-    runtime_state_ = nullptr;
-    pool_.Clear();
-    mem_pool_->FreeAll();
-    test_env_.reset();
-  }
-
-  /// Set up all of the test state: the buffer pool, a query state, a client with no
-  /// reservation and any other descriptors, etc.
-  /// The buffer pool's capacity is limited to 'buffer_pool_limit'.
-  void Init(int64_t buffer_pool_limit) {
-    test_env_.reset(new TestEnv());
-    test_env_->SetBufferPoolArgs(MIN_PAGE_LEN, buffer_pool_limit);
-    ASSERT_OK(test_env_->Init());
-
-    CreateDescriptors();
-    mem_pool_.reset(new MemPool(&tracker_));
-
-    ASSERT_OK(test_env_->CreateQueryState(0, nullptr, &runtime_state_));
-    query_state_ = runtime_state_->query_state();
-
-    RuntimeProfile* client_profile = pool_.Add(new RuntimeProfile(&pool_, "client"));
-    MemTracker* client_tracker =
-        pool_.Add(new MemTracker(-1, "client", runtime_state_->instance_mem_tracker()));
-    ASSERT_OK(test_env_->exec_env()->buffer_pool()->RegisterClient("",
-        query_state_->file_group(), runtime_state_->instance_buffer_reservation(),
-        client_tracker, numeric_limits<int>::max(), client_profile, &client_));
-  }
-
-  /// Generate the ith element of a sequence of int values.
-  int GenIntValue(int i) {
-    // Multiply by large prime to get varied bit patterns.
-    return i * PRIME;
-  }
-
-  /// Generate the ith element of a sequence of bool values.
-  bool GenBoolValue(int i) {
-    // Use a middle bit of the int value.
-    return ((GenIntValue(i) >> 8) & 0x1) != 0;
-  }
-
-  /// Count the total number of slots per row based on the given row descriptor.
-  int CountSlotsPerRow(const RowDescriptor& row_desc) {
-    int slots_per_row = 0;
-    for (int i = 0; i < row_desc.tuple_descriptors().size(); ++i) {
-      TupleDescriptor* tuple_desc = row_desc.tuple_descriptors()[i];
-      slots_per_row += tuple_desc->slots().size();
-    }
-    return slots_per_row;
-  }
-
-  /// Allocate a row batch with 'num_rows' of rows with layout described by 'row_desc'.
-  /// 'offset' is used to account for rows occupied by any previous row batches. This is
-  /// needed to match the values generated in VerifyResults(). If 'gen_null' is true,
-  /// some tuples will be set to NULL.
-  virtual RowBatch* CreateBatch(
-      const RowDescriptor* row_desc, int offset, int num_rows, bool gen_null) {
-    RowBatch* batch = pool_.Add(new RowBatch(row_desc, num_rows, &tracker_));
-    int num_tuples = row_desc->tuple_descriptors().size();
-
-    int idx = offset * CountSlotsPerRow(*row_desc);
-    for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
-      TupleRow* row = batch->GetRow(row_idx);
-      for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
-        TupleDescriptor* tuple_desc = row_desc->tuple_descriptors()[tuple_idx];
-        Tuple* tuple = Tuple::Create(tuple_desc->byte_size(), batch->tuple_data_pool());
-        bool is_null = gen_null && !GenBoolValue(idx);
-        for (int slot_idx = 0; slot_idx < tuple_desc->slots().size(); ++slot_idx, ++idx) {
-          SlotDescriptor* slot_desc = tuple_desc->slots()[slot_idx];
-          void* slot = tuple->GetSlot(slot_desc->tuple_offset());
-          switch (slot_desc->type().type) {
-            case TYPE_INT:
-              *reinterpret_cast<int*>(slot) = GenIntValue(idx);
-              break;
-            case TYPE_STRING:
-              *reinterpret_cast<StringValue*>(slot) = STRINGS[idx % NUM_STRINGS];
-              break;
-            default:
-              // The memory has been zero'ed out already by Tuple::Create().
-              break;
-          }
-        }
-        if (is_null) {
-          row->SetTuple(tuple_idx, nullptr);
-        } else {
-          row->SetTuple(tuple_idx, tuple);
-        }
-      }
-      batch->CommitLastRow();
-    }
-    return batch;
-  }
-
-  virtual RowBatch* CreateIntBatch(int offset, int num_rows, bool gen_null) {
-    return CreateBatch(int_desc_, offset, num_rows, gen_null);
-  }
-
-  virtual RowBatch* CreateStringBatch(int offset, int num_rows, bool gen_null) {
-    return CreateBatch(string_desc_, offset, num_rows, gen_null);
-  }
-
-  void AppendValue(uint8_t* ptr, vector<int>* results) {
-    if (ptr == nullptr) {
-      // For the tests indicate null-ability using the max int value
-      results->push_back(numeric_limits<int>::max());
-    } else {
-      results->push_back(*reinterpret_cast<int*>(ptr));
-    }
-  }
-
-  // Appends the StringValue stored at 'ptr' to 'results'. A null 'ptr' is recorded as a
-  // default-constructed StringValue. Otherwise the string bytes are copied into memory
-  // allocated from mem_pool_ and the appended value points at that copy.
-  void AppendValue(uint8_t* ptr, vector<StringValue>* results) {
-    if (ptr == nullptr) {
-      results->push_back(StringValue());
-      return;
-    }
-    StringValue copied = *reinterpret_cast<StringValue*>(ptr);
-    uint8_t* bytes = mem_pool_->Allocate(copied.len);
-    memcpy(bytes, copied.ptr, copied.len);
-    copied.ptr = reinterpret_cast<char*>(bytes);
-    results->push_back(copied);
-  }
-
-  // Appends one value per slot of every tuple in 'row' to 'results', visiting tuples and
-  // slots in descriptor order. Every slot of a null tuple is appended as a null value.
-  template <typename T>
-  void AppendRowTuples(TupleRow* row, RowDescriptor* row_desc, vector<T>* results) {
-    DCHECK(row != nullptr);
-    const auto& tuple_descs = row_desc->tuple_descriptors();
-    const int tuple_count = tuple_descs.size();
-
-    for (int t = 0; t < tuple_count; ++t) {
-      Tuple* tuple = row->GetTuple(t);
-      for (SlotDescriptor* slot_desc : tuple_descs[t]->slots()) {
-        if (tuple == nullptr) {
-          AppendValue(nullptr, results);
-          continue;
-        }
-        void* slot = tuple->GetSlot(slot_desc->tuple_offset());
-        AppendValue(reinterpret_cast<uint8_t*>(slot), results);
-      }
-    }
-  }
-
-  // Reads rows from 'stream' one RowBatch at a time and appends every slot value to
-  // 'results'. Reading stops at end-of-stream. If 'num_batches' is non-negative, reading
-  // also stops once more than 'num_batches' batches have been read, i.e. up to
-  // 'num_batches' + 1 batches are consumed. A negative value reads to the end.
-  template <typename T>
-  void ReadValues(BufferedTupleStreamV2* stream, RowDescriptor* desc, vector<T>* results,
-      int num_batches = -1) {
-    RowBatch batch(desc, BATCH_SIZE, &tracker_);
-    bool eos = false;
-    for (int batches_read = 1;; ++batches_read) {
-      batch.Reset();
-      EXPECT_OK(stream->GetNext(&batch, &eos));
-      for (int row_idx = 0; row_idx < batch.num_rows(); ++row_idx) {
-        AppendRowTuples(batch.GetRow(row_idx), desc, results);
-      }
-      if (eos) break;
-      if (num_batches >= 0 && batches_read > num_batches) break;
-    }
-  }
-
-  // Stores in 'val' the int expected at position 'idx': GenIntValue(idx), or the
-  // numeric_limits<int>::max() null sentinel when 'is_null' is set.
-  void GetExpectedValue(int idx, bool is_null, int* val) {
-    if (!is_null) {
-      *val = GenIntValue(idx);
-      return;
-    }
-    *val = numeric_limits<int>::max();
-  }
-
-  // Stores in 'val' the string expected at position 'idx': STRINGS[idx % NUM_STRINGS],
-  // or a default-constructed StringValue when 'is_null' is set.
-  void GetExpectedValue(int idx, bool is_null, StringValue* val) {
-    if (!is_null) {
-      *val = STRINGS[idx % NUM_STRINGS];
-      return;
-    }
-    *val = StringValue();
-  }
-
-  // Checks that 'results' holds exactly the values expected for 'num_rows' rows laid out
-  // according to 'row_desc', in row/tuple/slot order. When 'gen_null' is set, a tuple is
-  // expected to be null whenever GenBoolValue() of its first slot index is false.
-  // Reports a test failure (rather than reading out of bounds) if 'results' is too short,
-  // and a failure if it contains more values than expected.
-  template <typename T>
-  void VerifyResults(const RowDescriptor& row_desc, const vector<T>& results,
-      int num_rows, bool gen_null) {
-    const int num_results = static_cast<int>(results.size());
-    int idx = 0;
-    for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
-      const int num_tuples = row_desc.tuple_descriptors().size();
-      for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
-        const TupleDescriptor* tuple_desc = row_desc.tuple_descriptors()[tuple_idx];
-        const int num_slots = tuple_desc->slots().size();
-        bool is_null = gen_null && !GenBoolValue(idx);
-        for (int slot_idx = 0; slot_idx < num_slots; ++slot_idx, ++idx) {
-          // Guard the index: if fewer values were read back than expected, fail the test
-          // instead of indexing past the end of 'results'.
-          ASSERT_LT(idx, num_results)
-              << "too few results: row_idx=" << row_idx << " tuple_idx=" << tuple_idx
-              << " slot_idx=" << slot_idx << " gen_null=" << gen_null;
-          T expected_val;
-          GetExpectedValue(idx, is_null, &expected_val);
-          ASSERT_EQ(results[idx], expected_val)
-              << "results[" << idx << "] " << results[idx] << " != " << expected_val
-              << " row_idx=" << row_idx << " tuple_idx=" << tuple_idx
-              << " slot_idx=" << slot_idx << " gen_null=" << gen_null;
-        }
-      }
-    }
-    // Use a test assertion so that surplus results are also detected in release builds,
-    // where DCHECKs are compiled out.
-    ASSERT_EQ(num_results, idx);
-  }
-
-  // Writes 'num_batches' batches of 'num_rows' rows each, created from 'desc', into a
-  // stream, then reads all rows back and verifies them as values of type T.
-  // If 'unpin_stream' is true, the stream is unpinned before any rows are added.
-  // A 'max_page_len' of -1 means "same as 'default_page_len'".
-  // Assumes that enough buffers are available to read and write the stream.
-  template <typename T>
-  void TestValues(int num_batches, RowDescriptor* desc, bool gen_null, bool unpin_stream,
-      int64_t default_page_len = PAGE_LEN, int64_t max_page_len = -1,
-      int num_rows = BATCH_SIZE) {
-    if (max_page_len == -1) max_page_len = default_page_len;
-
-    BufferedTupleStreamV2 stream(
-        runtime_state_, desc, &client_, default_page_len, max_page_len);
-    ASSERT_OK(stream.Init(-1, true));
-    bool write_reserved;
-    ASSERT_OK(stream.PrepareForWrite(&write_reserved));
-    ASSERT_TRUE(write_reserved);
-    if (unpin_stream) stream.UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
-
-    // Write phase: 'rows_written' doubles as the value offset of the next batch.
-    int rows_written = 0;
-    for (int batch_idx = 0; batch_idx < num_batches; ++batch_idx) {
-      Status add_status;
-      ASSERT_TRUE(sizeof(T) == sizeof(int) || sizeof(T) == sizeof(StringValue));
-      RowBatch* batch = CreateBatch(desc, rows_written, num_rows, gen_null);
-      const int rows_in_batch = batch->num_rows();
-      for (int row_idx = 0; row_idx < rows_in_batch; ++row_idx) {
-        // TODO: test that AddRow succeeds after freeing memory.
-        const bool added = stream.AddRow(batch->GetRow(row_idx), &add_status);
-        ASSERT_OK(add_status);
-        ASSERT_TRUE(added);
-      }
-      rows_written += rows_in_batch;
-      // Reset the batch to make sure the stream handles the memory correctly.
-      batch->Reset();
-    }
-
-    // Read phase: read everything back without deleting on read.
-    bool read_reserved;
-    ASSERT_OK(stream.PrepareForRead(false, &read_reserved));
-    ASSERT_TRUE(read_reserved);
-
-    vector<T> values_read;
-    ReadValues(&stream, desc, &values_read);
-    VerifyResults<T>(*desc, values_read, num_rows * num_batches, gen_null);
-
-    stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-  }
-
-  // Adds 'num_batches' int batches to a stream opened for simultaneous reading and
-  // writing. After every batch whose index is a multiple of 'num_batches_before_read',
-  // a randomly sized chunk of batches is read back. Whatever is left is read at the end
-  // and the concatenation of all values read is verified.
-  // If 'unpin_stream' is true, the stream is unpinned before any rows are added.
-  void TestIntValuesInterleaved(int num_batches, int num_batches_before_read,
-      bool unpin_stream, int64_t page_len = PAGE_LEN) {
-    BufferedTupleStreamV2 stream(runtime_state_, int_desc_, &client_, page_len, page_len);
-    ASSERT_OK(stream.Init(-1, true));
-    bool reserved;
-    ASSERT_OK(stream.PrepareForReadWrite(true, &reserved));
-    ASSERT_TRUE(reserved);
-    if (unpin_stream) stream.UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
-
-    vector<int> values_read;
-    for (int batch_idx = 0; batch_idx < num_batches; ++batch_idx) {
-      RowBatch* batch = CreateIntBatch(batch_idx * BATCH_SIZE, BATCH_SIZE, false);
-      const int rows_in_batch = batch->num_rows();
-      for (int row_idx = 0; row_idx < rows_in_batch; ++row_idx) {
-        Status add_status;
-        const bool added = stream.AddRow(batch->GetRow(row_idx), &add_status);
-        ASSERT_TRUE(added);
-        ASSERT_OK(add_status);
-      }
-      // Reset the batch to make sure the stream handles the memory correctly.
-      batch->Reset();
-      if (batch_idx % num_batches_before_read != 0) continue;
-      const int batches_to_read = (rand() % num_batches_before_read) + 1;
-      ReadValues(&stream, int_desc_, &values_read, batches_to_read);
-    }
-    // Drain the remainder of the stream.
-    ReadValues(&stream, int_desc_, &values_read);
-
-    VerifyResults<int>(*int_desc_, values_read, BATCH_SIZE * num_batches, false);
-
-    stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-  }
-
-  void TestUnpinPin(bool varlen_data, bool read_write);
-
-  void TestTransferMemory(bool pinned_stream, bool read_write);
-
-  // Helper that writes 'row', comprised of only string slots, to 'data'. The expected
-  // length of the data written is 'expected_len'.
-  void WriteStringRow(const RowDescriptor* row_desc, TupleRow* row, int64_t fixed_size,
-      int64_t varlen_size, uint8_t* data);
-
-  // The temporary runtime environment used for the test.
-  scoped_ptr<TestEnv> test_env_;
-  RuntimeState* runtime_state_;
-  QueryState* query_state_;
-
-  // Buffer pool client - automatically deregistered in TearDown().
-  BufferPool::ClientHandle client_;
-
-  // Dummy MemTracker used for miscellaneous memory.
-  MemTracker tracker_;
-  ObjectPool pool_;
-  RowDescriptor* int_desc_;
-  RowDescriptor* string_desc_;
-
-  static const int64_t BIG_ROW_BYTES = 16 * 1024;
-  RowDescriptor* big_row_desc_;
-  RowDescriptor* nullable_big_row_desc_;
-  scoped_ptr<MemPool> mem_pool_;
-};
-
-// Tests with a single NULLable tuple per row.
-class SimpleNullStreamTest : public SimpleTupleStreamTest {
- protected:
-  virtual void CreateDescriptors() {
-    vector<bool> nullable_tuples(1, true); // The one tuple in each row may be null.
-    vector<TTupleId> tuple_ids(1, static_cast<TTupleId>(0));
-
-    DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
-    int_builder.DeclareTuple() << TYPE_INT;
-    int_desc_ =
-        pool_.Add(new RowDescriptor(*int_builder.Build(), tuple_ids, nullable_tuples));
-
-    DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
-    string_builder.DeclareTuple() << TYPE_STRING;
-    string_desc_ =
-        pool_.Add(new RowDescriptor(*string_builder.Build(), tuple_ids, nullable_tuples));
-  }
-}; // SimpleNullStreamTest
-
-// Tests with multiple non-NULLable tuples per row.
-class MultiTupleStreamTest : public SimpleTupleStreamTest {
- protected:
-  // Builds int_desc_ and string_desc_ as rows of three single-slot tuples with ids 0-2,
-  // none of which is nullable.
-  virtual void CreateDescriptors() {
-    const int NUM_TUPLES = 3;
-    vector<bool> nullable_tuples(NUM_TUPLES, false);
-
-    vector<TTupleId> tuple_ids;
-    for (int i = 0; i < NUM_TUPLES; ++i) tuple_ids.push_back(static_cast<TTupleId>(i));
-
-    DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
-    for (int i = 0; i < NUM_TUPLES; ++i) int_builder.DeclareTuple() << TYPE_INT;
-    int_desc_ =
-        pool_.Add(new RowDescriptor(*int_builder.Build(), tuple_ids, nullable_tuples));
-
-    DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
-    for (int i = 0; i < NUM_TUPLES; ++i) string_builder.DeclareTuple() << TYPE_STRING;
-    string_desc_ =
-        pool_.Add(new RowDescriptor(*string_builder.Build(), tuple_ids, nullable_tuples));
-  }
-};
-
-// Tests with multiple NULLable tuples per row.
-class MultiNullableTupleStreamTest : public SimpleTupleStreamTest {
- protected:
-  // Builds int_desc_ and string_desc_ as rows of three single-slot tuples with ids 0-2.
-  // The first tuple is non-nullable, the other two are nullable.
-  virtual void CreateDescriptors() {
-    const int NUM_TUPLES = 3;
-    vector<bool> nullable_tuples(NUM_TUPLES, true);
-    nullable_tuples[0] = false;
-
-    vector<TTupleId> tuple_ids;
-    for (int i = 0; i < NUM_TUPLES; ++i) tuple_ids.push_back(static_cast<TTupleId>(i));
-
-    DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
-    for (int i = 0; i < NUM_TUPLES; ++i) int_builder.DeclareTuple() << TYPE_INT;
-    int_desc_ =
-        pool_.Add(new RowDescriptor(*int_builder.Build(), tuple_ids, nullable_tuples));
-
-    DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
-    for (int i = 0; i < NUM_TUPLES; ++i) string_builder.DeclareTuple() << TYPE_STRING;
-    string_desc_ =
-        pool_.Add(new RowDescriptor(*string_builder.Build(), tuple_ids, nullable_tuples));
-  }
-};
-
-/// Tests with collection types.
-class ArrayTupleStreamTest : public SimpleTupleStreamTest {
- protected:
-  /// Row descriptor with two nullable tuples of array-typed slots; see
-  /// CreateDescriptors() for the exact layout.
-  RowDescriptor* array_desc_;
-
-  /// Builds array_desc_ with the layout
-  ///   tuple 0: (array<string>, array<array<int>>)
-  ///   tuple 1: (array<int>)
-  virtual void CreateDescriptors() {
-    // tuples: (array<string>, array<array<int>>) (array<int>)
-    vector<bool> nullable_tuples(2, true);
-    vector<TTupleId> tuple_ids;
-    tuple_ids.push_back(static_cast<TTupleId>(0));
-    tuple_ids.push_back(static_cast<TTupleId>(1));
-    ColumnType string_array_type;
-    string_array_type.type = TYPE_ARRAY;
-    string_array_type.children.push_back(TYPE_STRING);
-
-    // The element type must be TYPE_INT so that this really is array<int>, and so that
-    // nested_array_type below is array<array<int>> as documented.
-    ColumnType int_array_type;
-    int_array_type.type = TYPE_ARRAY;
-    int_array_type.children.push_back(TYPE_INT);
-
-    ColumnType nested_array_type;
-    nested_array_type.type = TYPE_ARRAY;
-    nested_array_type.children.push_back(int_array_type);
-
-    DescriptorTblBuilder builder(test_env_->exec_env()->frontend(), &pool_);
-    builder.DeclareTuple() << string_array_type << nested_array_type;
-    builder.DeclareTuple() << int_array_type;
-    array_desc_ =
-        pool_.Add(new RowDescriptor(*builder.Build(), tuple_ids, nullable_tuples));
-  }
-};
-
-// Basic API test. No data should be going to disk.
-TEST_F(SimpleTupleStreamTest, Basic) {
-  Init(numeric_limits<int64_t>::max());
-  const int BATCH_COUNTS[] = {0, 1, 10, 100};
-  const bool UNPIN_MODES[] = {true, false};
-  // {num_batches, num_batches_before_read} pairs for the interleaved test.
-  const int INTERLEAVED_PARAMS[][2] = {{0, 1}, {1, 1}, {10, 5}, {100, 15}};
-
-  for (bool unpin : UNPIN_MODES) {
-    for (int num_batches : BATCH_COUNTS) {
-      TestValues<int>(num_batches, int_desc_, false, unpin);
-    }
-  }
-
-  for (bool unpin : UNPIN_MODES) {
-    for (int num_batches : BATCH_COUNTS) {
-      TestValues<StringValue>(num_batches, string_desc_, false, unpin);
-    }
-  }
-
-  for (bool unpin : UNPIN_MODES) {
-    for (const auto& params : INTERLEAVED_PARAMS) {
-      TestIntValuesInterleaved(params[0], params[1], unpin);
-    }
-  }
-}
-
-// Test with only 1 buffer.
-TEST_F(SimpleTupleStreamTest, OneBufferSpill) {
-  // Each buffer can only hold 128 ints, so this spills quite often.
-  int buffer_size = 128 * sizeof(int);
-  Init(buffer_size);
-  const int BATCH_COUNTS[] = {0, 1, 10};
-
-  for (int num_batches : BATCH_COUNTS) {
-    TestValues<int>(num_batches, int_desc_, false, true, buffer_size);
-  }
-
-  for (int num_batches : BATCH_COUNTS) {
-    TestValues<StringValue>(num_batches, string_desc_, false, true, buffer_size);
-  }
-}
-
-// Test with a few buffers.
-TEST_F(SimpleTupleStreamTest, ManyBufferSpill) {
-  int buffer_size = 128 * sizeof(int);
-  Init(10 * buffer_size);
-  const int BATCH_COUNTS[] = {0, 1, 10, 100};
-  // {num_batches, num_batches_before_read} pairs for the interleaved test.
-  const int INTERLEAVED_PARAMS[][2] = {{0, 1}, {1, 1}, {10, 5}, {100, 15}};
-
-  for (int num_batches : BATCH_COUNTS) {
-    TestValues<int>(num_batches, int_desc_, false, true, buffer_size);
-  }
-  for (int num_batches : BATCH_COUNTS) {
-    TestValues<StringValue>(num_batches, string_desc_, false, true, buffer_size);
-  }
-
-  for (const auto& params : INTERLEAVED_PARAMS) {
-    TestIntValuesInterleaved(params[0], params[1], true, buffer_size);
-  }
-}
-
-void SimpleTupleStreamTest::TestUnpinPin(bool varlen_data, bool read_write) {
-  int buffer_size = 128 * sizeof(int);
-  int num_buffers = 10;
-  Init(num_buffers * buffer_size);
-  RowDescriptor* row_desc = varlen_data ? string_desc_ : int_desc_; // Rows under test.
-
-  BufferedTupleStreamV2 stream(
-      runtime_state_, row_desc, &client_, buffer_size, buffer_size);
-  ASSERT_OK(stream.Init(-1, true));
-  if (read_write) {
-    bool got_reservation = false;
-    ASSERT_OK(stream.PrepareForReadWrite(false, &got_reservation));
-    ASSERT_TRUE(got_reservation);
-  } else {
-    bool got_write_reservation;
-    ASSERT_OK(stream.PrepareForWrite(&got_write_reservation));
-    ASSERT_TRUE(got_write_reservation);
-  }
-
-  int offset = 0; // Total number of rows successfully added so far.
-  bool full = false;
-  int num_batches = 0;
-  while (!full) { // Keep adding batches until AddRow() reports failure.
-    // Every 10th batch: unpin and re-pin, to check we can switch states while writing.
-    if (num_batches % 10 == 0) {
-      bool pinned;
-      stream.UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
-      ASSERT_OK(stream.PinStream(&pinned));
-      DCHECK(pinned);
-    }
-
-    RowBatch* batch = varlen_data ? CreateStringBatch(offset, BATCH_SIZE, false) :
-                                    CreateIntBatch(offset, BATCH_SIZE, false);
-    int j = 0;
-    for (; j < batch->num_rows(); ++j) {
-      Status status;
-      full = !stream.AddRow(batch->GetRow(j), &status);
-      ASSERT_OK(status);
-      if (full) break;
-    }
-    offset += j; // Only count the rows of this batch that were actually added.
-    ++num_batches;
-  }
-
-  stream.UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
-
-  bool pinned = false;
-  ASSERT_OK(stream.PinStream(&pinned));
-  ASSERT_TRUE(pinned);
-
-  // Read and verify the result three times. Rereading must work as long as we do not
-  // read in delete-on-read mode, so only the final pass enables delete_on_read.
-  int read_iters = 3;
-  for (int i = 0; i < read_iters; ++i) {
-    bool delete_on_read = i == read_iters - 1;
-    if (i > 0 || !read_write) { // The first read/write-mode pass needs no PrepareForRead().
-      bool got_read_reservation;
-      ASSERT_OK(stream.PrepareForRead(delete_on_read, &got_read_reservation));
-      ASSERT_TRUE(got_read_reservation);
-    }
-
-    if (varlen_data) {
-      vector<StringValue> results;
-      ReadValues(&stream, row_desc, &results);
-      VerifyResults<StringValue>(*string_desc_, results, offset, false);
-    } else {
-      vector<int> results;
-      ReadValues(&stream, row_desc, &results);
-      VerifyResults<int>(*int_desc_, results, offset, false);
-    }
-  }
-
-  // After the delete_on_read pass, all blocks aside from the last should be deleted.
-  // Note: this should really be 0, but the BufferedTupleStreamV2 returns eos before
-  // deleting the last block, rather than after, so the last block isn't deleted
-  // until the stream is closed. Hence exactly one buffer is expected to stay pinned.
-  ASSERT_EQ(stream.BytesPinned(false), buffer_size);
-
-  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-
-  ASSERT_EQ(stream.BytesPinned(false), 0);
-}
-
-// Unpin/pin cycle on fixed-length (int) rows using a write-only stream.
-TEST_F(SimpleTupleStreamTest, UnpinPin) {
-  const bool varlen_data = false;
-  const bool read_write = false;
-  TestUnpinPin(varlen_data, read_write);
-}
-
-// Unpin/pin cycle on fixed-length (int) rows using a stream prepared for read/write.
-TEST_F(SimpleTupleStreamTest, UnpinPinReadWrite) {
-  const bool varlen_data = false;
-  const bool read_write = true;
-  TestUnpinPin(varlen_data, read_write);
-}
-
-// Unpin/pin cycle on variable-length (string) rows using a write-only stream.
-TEST_F(SimpleTupleStreamTest, UnpinPinVarlen) {
-  // 'varlen_data' must be true here; passing false merely repeated the UnpinPin test and
-  // left the string path of TestUnpinPin() unexercised.
-  TestUnpinPin(true, false);
-}
-
-void SimpleTupleStreamTest::TestTransferMemory(bool pin_stream, bool read_write) {
-  // Use smaller buffers so that the explicit FLUSH_RESOURCES flag is required to
-  // make the batch at capacity (checked via AtCapacity() at the end of this test).
-  int buffer_size = 4 * 1024;
-  Init(100 * buffer_size);
-
-  BufferedTupleStreamV2 stream(
-      runtime_state_, int_desc_, &client_, buffer_size, buffer_size);
-  ASSERT_OK(stream.Init(-1, pin_stream));
-  if (read_write) {
-    bool got_reservation;
-    ASSERT_OK(stream.PrepareForReadWrite(true, &got_reservation));
-    ASSERT_TRUE(got_reservation);
-  } else {
-    bool got_write_reservation;
-    ASSERT_OK(stream.PrepareForWrite(&got_write_reservation));
-    ASSERT_TRUE(got_write_reservation);
-  }
-  RowBatch* batch = CreateIntBatch(0, 1024, false);
-
-  // Construct a stream with 4 pages by re-adding the same batch until it is big enough.
-  const int total_num_pages = 4;
-  while (stream.byte_size() < total_num_pages * buffer_size) {
-    Status status;
-    for (int i = 0; i < batch->num_rows(); ++i) {
-      bool ret = stream.AddRow(batch->GetRow(i), &status);
-      EXPECT_TRUE(ret);
-      ASSERT_OK(status);
-    }
-  }
-
-  batch->Reset(); // The batch is reused below as the target of the memory transfer.
-
-  if (read_write) {
-    // Read back batch so that we have a read buffer in memory.
-    bool eos;
-    ASSERT_OK(stream.GetNext(batch, &eos));
-    EXPECT_FALSE(eos);
-  }
-  stream.Close(batch, RowBatch::FlushMode::FLUSH_RESOURCES);
-  if (pin_stream) {
-    EXPECT_EQ(total_num_pages, batch->num_buffers()); // Every page should be attached.
-  } else if (read_write) {
-    // Read and write buffer should be attached.
-    EXPECT_EQ(2, batch->num_buffers());
-  } else {
-    // NOTE(review): nothing was read on this path; presumably this is the write buffer.
-    EXPECT_EQ(1, batch->num_buffers());
-  }
-  EXPECT_TRUE(batch->AtCapacity()); // Flush resources flag should have been set.
-  batch->Reset();
-  EXPECT_EQ(0, batch->num_buffers());
-}
-
-/// Test attaching memory to a row batch from a pinned stream in read/write mode.
-TEST_F(SimpleTupleStreamTest, TransferMemoryFromPinnedStreamReadWrite) {
-  const bool pinned_stream = true;
-  const bool read_write = true;
-  TestTransferMemory(pinned_stream, read_write);
-}
-
-/// Test attaching memory to a row batch from a pinned, write-only stream.
-TEST_F(SimpleTupleStreamTest, TransferMemoryFromPinnedStreamNoReadWrite) {
-  const bool pinned_stream = true;
-  const bool read_write = false;
-  TestTransferMemory(pinned_stream, read_write);
-}
-
-/// Test attaching memory to a row batch from an unpinned stream in read/write mode.
-TEST_F(SimpleTupleStreamTest, TransferMemoryFromUnpinnedStreamReadWrite) {
-  const bool pinned_stream = false;
-  const bool read_write = true;
-  TestTransferMemory(pinned_stream, read_write);
-}
-
-/// Test attaching memory to a row batch from an unpinned, write-only stream.
-TEST_F(SimpleTupleStreamTest, TransferMemoryFromUnpinnedStreamNoReadWrite) {
-  const bool pinned_stream = false;
-  const bool read_write = false;
-  TestTransferMemory(pinned_stream, read_write);
-}
-
-// Test that the tuple stream works when it references strings stored outside the
-// stream. The aggregation node relies on this since it updates tuples in-place.
-TEST_F(SimpleTupleStreamTest, StringsOutsideStream) {
-  int buffer_size = 8 * 1024 * 1024;
-  Init(2 * buffer_size);
-  Status status = Status::OK();
-
-  int num_batches = 100;
-  int rows_added = 0;
-  DCHECK_EQ(string_desc_->tuple_descriptors().size(), 1);
-  TupleDescriptor& tuple_desc = *string_desc_->tuple_descriptors()[0];
-
-  set<SlotId> external_slots; // Ids of all string slots; passed to the stream below.
-  for (int i = 0; i < tuple_desc.string_slots().size(); ++i) {
-    external_slots.insert(tuple_desc.string_slots()[i]->id());
-  }
-
-  BufferedTupleStreamV2 stream(
-      runtime_state_, string_desc_, &client_, buffer_size, buffer_size, external_slots);
-  ASSERT_OK(stream.Init(0, false));
-  bool got_reservation;
-  ASSERT_OK(stream.PrepareForWrite(&got_reservation));
-  ASSERT_TRUE(got_reservation);
-
-  for (int i = 0; i < num_batches; ++i) {
-    RowBatch* batch = CreateStringBatch(rows_added, BATCH_SIZE, false);
-    for (int j = 0; j < batch->num_rows(); ++j) {
-      int fixed_size = tuple_desc.byte_size();
-      // Copy only the fixed-size tuple bytes in; string pointers keep referring to the batch.
-      uint8_t* tuple_data = stream.AddRowCustomBegin(fixed_size, &status);
-      ASSERT_TRUE(tuple_data != nullptr);
-      ASSERT_TRUE(status.ok());
-      memcpy(tuple_data, batch->GetRow(j)->GetTuple(0), fixed_size);
-      stream.AddRowCustomEnd(fixed_size);
-    }
-    rows_added += batch->num_rows();
-  }
-
-  DCHECK_EQ(rows_added, stream.num_rows());
-
-  for (int delete_on_read = 0; delete_on_read <= 1; ++delete_on_read) {
-    // Read the whole stream twice: first without, then with delete-on-read.
-    vector<StringValue> results;
-    bool got_read_reservation;
-    ASSERT_OK(stream.PrepareForRead(delete_on_read, &got_read_reservation));
-    ASSERT_TRUE(got_read_reservation);
-    ReadValues(&stream, string_desc_, &results);
-    VerifyResults<StringValue>(*string_desc_, results, rows_added, false);
-  }
-
-  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// Construct a big row by stitching together many tuples so the total row size
-// will be close to the IO block size. With nullable tuples the extra null indicators
-// make a fully populated row too large to add; without them things should work fine.
-TEST_F(SimpleTupleStreamTest, BigRow) {
-  const int64_t MAX_BUFFERS = 10;
-  Init(MAX_BUFFERS * BIG_ROW_BYTES);
-
-  // Test writing this row into the stream and then reading it back.
-  // Make sure to exercise the case where the row is larger than the default page.
-  // If the stream is pinned, we can only fit MAX_BUFFERS - 1 rows (since we always
-  // advance to the next page). In the unpinned case we should be able to write
-  // arbitrarily many rows.
-  TestValues<int>(1, big_row_desc_, false, false, BIG_ROW_BYTES, BIG_ROW_BYTES, 1);
-  TestValues<int>(
-      MAX_BUFFERS - 1, big_row_desc_, false, false, BIG_ROW_BYTES, BIG_ROW_BYTES, 1);
-  TestValues<int>(1, big_row_desc_, false, false, BIG_ROW_BYTES / 4, BIG_ROW_BYTES, 1);
-  TestValues<int>(
-      MAX_BUFFERS - 1, big_row_desc_, false, false, BIG_ROW_BYTES / 4, BIG_ROW_BYTES, 1);
-  TestValues<int>(1, big_row_desc_, false, true, BIG_ROW_BYTES, BIG_ROW_BYTES, 1);
-  TestValues<int>(
-      MAX_BUFFERS - 1, big_row_desc_, false, true, BIG_ROW_BYTES, BIG_ROW_BYTES, 1);
-  TestValues<int>(
-      5 * MAX_BUFFERS, big_row_desc_, false, true, BIG_ROW_BYTES, BIG_ROW_BYTES, 1);
-  TestValues<int>(1, big_row_desc_, false, true, BIG_ROW_BYTES / 4, BIG_ROW_BYTES, 1);
-  TestValues<int>(
-      MAX_BUFFERS - 1, big_row_desc_, false, true, BIG_ROW_BYTES / 4, BIG_ROW_BYTES, 1);
-  TestValues<int>(
-      5 * MAX_BUFFERS, big_row_desc_, false, true, BIG_ROW_BYTES / 4, BIG_ROW_BYTES, 1);
-
-  // Test the case where it fits in an in-between page size.
-  TestValues<int>(MAX_BUFFERS - 1, big_row_desc_, false, false, BIG_ROW_BYTES / 4,
-      BIG_ROW_BYTES * 2, 1);
-  TestValues<int>(MAX_BUFFERS - 1, big_row_desc_, false, true, BIG_ROW_BYTES / 4,
-      BIG_ROW_BYTES * 2, 1);
-
-  // Construct a big row with nullable tuples. This requires extra space for null
-  // indicators in the stream so adding a fully populated row will fail.
-  ASSERT_TRUE(nullable_big_row_desc_->IsAnyTupleNullable());
-  BufferedTupleStreamV2 nullable_stream(
-      runtime_state_, nullable_big_row_desc_, &client_, BIG_ROW_BYTES, BIG_ROW_BYTES);
-  ASSERT_OK(nullable_stream.Init(-1, true));
-  bool got_reservation;
-  ASSERT_OK(nullable_stream.PrepareForWrite(&got_reservation));
-  // As in the other tests, fail fast if the write reservation was not granted; the
-  // AddRow() expectations below would otherwise be meaningless.
-  ASSERT_TRUE(got_reservation);
-
-  // With null tuples, a row can fit in the stream.
-  RowBatch* batch = CreateBatch(nullable_big_row_desc_, 0, 1, true);
-  Status status;
-  EXPECT_TRUE(nullable_stream.AddRow(batch->GetRow(0), &status));
-  // With the additional null indicator, we can't fit all the tuples of a row into
-  // the stream.
-  batch = CreateBatch(nullable_big_row_desc_, 0, 1, false);
-  EXPECT_FALSE(nullable_stream.AddRow(batch->GetRow(0), &status));
-  EXPECT_EQ(TErrorCode::MAX_ROW_SIZE, status.code());
-  nullable_stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// Test the memory use for large rows: each added big row should use one large page.
-TEST_F(SimpleTupleStreamTest, BigRowMemoryUse) {
-  const int64_t MAX_BUFFERS = 10;
-  const int64_t DEFAULT_PAGE_LEN = BIG_ROW_BYTES / 4;
-  Init(MAX_BUFFERS * BIG_ROW_BYTES);
-  Status status;
-  BufferedTupleStreamV2 stream(
-      runtime_state_, big_row_desc_, &client_, DEFAULT_PAGE_LEN, BIG_ROW_BYTES * 2);
-  ASSERT_OK(stream.Init(-1, true));
-  RowBatch* batch;
-  bool got_reservation;
-  ASSERT_OK(stream.PrepareForWrite(&got_reservation));
-  ASSERT_TRUE(got_reservation);
-  // We should be able to append MAX_BUFFERS rows without problem.
-  for (int i = 0; i < MAX_BUFFERS; ++i) {
-    batch = CreateBatch(big_row_desc_, i, 1, false);
-    bool success = stream.AddRow(batch->GetRow(0), &status);
-    ASSERT_TRUE(success);
-    // We should have one large page per row, so used reservation grows by BIG_ROW_BYTES.
-    EXPECT_EQ(BIG_ROW_BYTES * (i + 1), client_.GetUsedReservation())
-        << i << ": " << client_.DebugString();
-  }
-
-  // We can't fit another row in memory - need to unpin to make progress.
-  batch = CreateBatch(big_row_desc_, MAX_BUFFERS, 1, false);
-  bool success = stream.AddRow(batch->GetRow(0), &status);
-  ASSERT_FALSE(success);
-  ASSERT_OK(status); // Running out of room is expected to be reported without an error.
-  stream.UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
-  success = stream.AddRow(batch->GetRow(0), &status);
-  ASSERT_TRUE(success);
-  // Read all MAX_BUFFERS + 1 rows back and verify.
-  ASSERT_OK(stream.PrepareForRead(false, &got_reservation));
-  ASSERT_TRUE(got_reservation);
-  vector<int> results;
-  ReadValues(&stream, big_row_desc_, &results);
-  VerifyResults<int>(*big_row_desc_, results, MAX_BUFFERS + 1, false);
-  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// Test for IMPALA-3923: overflow of 32-bit int in GetRows().
-// GetRows() must return an error status when the stream claims to hold more rows than
-// fit in a RowBatch.
-TEST_F(SimpleTupleStreamTest, TestGetRowsOverflow) {
-  Init(BUFFER_POOL_LIMIT);
-  BufferedTupleStreamV2 stream(runtime_state_, int_desc_, &client_, PAGE_LEN, PAGE_LEN);
-  ASSERT_OK(stream.Init(-1, true));
-
-  // Add more rows than can be fit in a RowBatch (limited by its 32-bit row count).
-  // Actually adding the rows would take a very long time, so just set num_rows_.
-  // This puts the stream in an inconsistent state, but exercises the right code path.
-  // Use a long long literal so the shift is well-defined even where long is 32 bits.
-  stream.num_rows_ = 1LL << 33;
-  bool got_rows;
-  scoped_ptr<RowBatch> overflow_batch;
-  ASSERT_FALSE(stream.GetRows(&tracker_, &overflow_batch, &got_rows).ok());
-  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// Test rows greater than the default page size. Also exercise the read/write
-// mode with large pages: each row is read back right after it has been written.
-TEST_F(SimpleTupleStreamTest, BigStringReadWrite) {
-  const int64_t MAX_BUFFERS = 10;
-  const int64_t DEFAULT_PAGE_LEN = BIG_ROW_BYTES / 4;
-  Init(MAX_BUFFERS * BIG_ROW_BYTES);
-  Status status;
-  BufferedTupleStreamV2 stream(
-      runtime_state_, string_desc_, &client_, DEFAULT_PAGE_LEN, BIG_ROW_BYTES * 2);
-  ASSERT_OK(stream.Init(-1, true));
-  RowBatch write_batch(string_desc_, 1024, &tracker_);
-  RowBatch read_batch(string_desc_, 1024, &tracker_);
-  bool got_reservation;
-  ASSERT_OK(stream.PrepareForReadWrite(false, &got_reservation));
-  ASSERT_TRUE(got_reservation);
-  TupleRow* write_row = write_batch.GetRow(0);
-  TupleDescriptor* tuple_desc = string_desc_->tuple_descriptors()[0];
-  vector<uint8_t> tuple_mem(tuple_desc->byte_size()); // Backing memory for the one tuple.
-  Tuple* write_tuple = reinterpret_cast<Tuple*>(tuple_mem.data());
-  write_row->SetTuple(0, write_tuple);
-  StringValue* write_str = reinterpret_cast<StringValue*>(
-      write_tuple->GetSlot(tuple_desc->slots()[0]->tuple_offset()));
-  // Make the string large enough that tuple plus string add up to BIG_ROW_BYTES.
-  const int64_t string_len = BIG_ROW_BYTES - tuple_desc->byte_size();
-  vector<char> data(string_len);
-  write_str->len = string_len;
-  write_str->ptr = data.data();
-
-  // We should be able to append MAX_BUFFERS rows without problem.
-  for (int i = 0; i < MAX_BUFFERS; ++i) {
-    // Fill the string with the value i so that each row is distinguishable.
-    memset(write_str->ptr, i, write_str->len);
-    bool success = stream.AddRow(write_row, &status);
-    ASSERT_TRUE(success);
-    // NOTE(review): the check below counts only one BIG_ROW_BYTES page per row; the old
-    // remark about extra default-size read/write and empty first pages looks stale.
-    EXPECT_EQ(BIG_ROW_BYTES * (i + 1), client_.GetUsedReservation())
-        << i << ": " << client_.DebugString() << "\n"
-        << stream.DebugString();
-
-    // Read back the rows as we write them to test read/write mode.
-    read_batch.Reset();
-    bool eos;
-    ASSERT_OK(stream.GetNext(&read_batch, &eos));
-    EXPECT_EQ(1, read_batch.num_rows());
-    EXPECT_TRUE(eos); // The row just written is the only unread row.
-    Tuple* tuple = read_batch.GetRow(0)->GetTuple(0);
-    StringValue* str = reinterpret_cast<StringValue*>(
-        tuple->GetSlot(tuple_desc->slots()[0]->tuple_offset()));
-    EXPECT_EQ(string_len, str->len);
-    for (int j = 0; j < string_len; ++j) {
-      EXPECT_EQ(i, str->ptr[j]) << j;
-    }
-  }
-
-  // We can't fit another row in memory - need to unpin to make progress.
-  memset(write_str->ptr, MAX_BUFFERS, write_str->len);
-  bool success = stream.AddRow(write_row, &status);
-  ASSERT_FALSE(success);
-  ASSERT_OK(status);
-  stream.UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
-  success = stream.AddRow(write_row, &status);
-  ASSERT_TRUE(success);
-
-  // Read all MAX_BUFFERS + 1 rows back from the start and verify each row's fill value.
-  ASSERT_OK(stream.PrepareForRead(false, &got_reservation));
-  ASSERT_TRUE(got_reservation);
-  for (int i = 0; i < MAX_BUFFERS + 1; ++i) {
-    read_batch.Reset();
-    bool eos;
-    ASSERT_OK(stream.GetNext(&read_batch, &eos));
-    EXPECT_EQ(1, read_batch.num_rows());
-    EXPECT_EQ(eos, i == MAX_BUFFERS) << i; // eos is expected only with the last row.
-    Tuple* tuple = read_batch.GetRow(0)->GetTuple(0);
-    StringValue* str = reinterpret_cast<StringValue*>(
-        tuple->GetSlot(tuple_desc->slots()[0]->tuple_offset()));
-    EXPECT_EQ(string_len, str->len);
-    for (int j = 0; j < string_len; ++j) {
-      ASSERT_EQ(i, str->ptr[j]) << j;
-    }
-  }
-  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// Basic API test. No data should be going to disk.
-TEST_F(SimpleNullStreamTest, Basic) {
-  Init(BUFFER_POOL_LIMIT);
-  const int BATCH_COUNTS[] = {0, 1, 10, 100};
-  const bool UNPIN_MODES[] = {true, false};
-  const bool GEN_NULL_MODES[] = {false, true};
-  // {num_batches, num_batches_before_read} pairs for the interleaved test.
-  const int INTERLEAVED_PARAMS[][2] = {{0, 1}, {1, 1}, {10, 5}, {100, 15}};
-
-  for (bool unpin : UNPIN_MODES) {
-    for (bool gen_null : GEN_NULL_MODES) {
-      for (int num_batches : BATCH_COUNTS) {
-        TestValues<int>(num_batches, int_desc_, gen_null, unpin);
-      }
-    }
-  }
-
-  for (bool unpin : UNPIN_MODES) {
-    for (bool gen_null : GEN_NULL_MODES) {
-      for (int num_batches : BATCH_COUNTS) {
-        TestValues<StringValue>(num_batches, string_desc_, gen_null, unpin);
-      }
-    }
-  }
-
-  for (bool unpin : UNPIN_MODES) {
-    for (const auto& params : INTERLEAVED_PARAMS) {
-      TestIntValuesInterleaved(params[0], params[1], unpin);
-    }
-  }
-}
-
-// Test tuple stream with only 1 buffer and rows with multiple tuples.
-TEST_F(MultiTupleStreamTest, MultiTupleOneBufferSpill) {
-  // Each buffer can only hold 128 ints, so this spills quite often.
-  int buffer_size = 128 * sizeof(int);
-  Init(buffer_size);
-  TestValues<int>(0, int_desc_, false, true, buffer_size);
-  TestValues<int>(1, int_desc_, false, true, buffer_size);
-  TestValues<int>(10, int_desc_, false, true, buffer_size);
-
-  TestValues<StringValue>(0, string_desc_, false, true, buffer_size);
-  TestValues<StringValue>(1, string_desc_, false, true, buffer_size);
-  TestValues<StringValue>(10, string_desc_, false, true, buffer_size);
-}
-
-// Test with a few buffers and rows with multiple tuples.
-TEST_F(MultiTupleStreamTest, MultiTupleManyBufferSpill) {
-  int buffer_size = 128 * sizeof(int);
-  Init(10 * buffer_size);
-
-  TestValues<int>(0, int_desc_, false, true, buffer_size);
-  TestValues<int>(1, int_desc_, false, true, buffer_size);
-  TestValues<int>(10, int_desc_, false, true, buffer_size);
-  TestValues<int>(100, int_desc_, false, true, buffer_size);
-
-  TestValues<StringValue>(0, string_desc_, false, true, buffer_size);
-  TestValues<StringValue>(1, string_desc_, false, true, buffer_size);
-  TestValues<StringValue>(10, string_desc_, false, true, buffer_size);
-  TestValues<StringValue>(100, string_desc_, false, true, buffer_size);
-
-  TestIntValuesInterleaved(1, 1, true, buffer_size);
-  TestIntValuesInterleaved(10, 5, true, buffer_size);
-  TestIntValuesInterleaved(100, 15, true, buffer_size);
-}
-
-// Test that we can allocate a row in the stream and copy in multiple tuples then
-// read it back from the stream.
-TEST_F(MultiTupleStreamTest, MultiTupleAddRowCustom) {
-  // Use small buffers so it will be flushed to disk.
-  int buffer_size = 4 * 1024;
-  Init(2 * buffer_size);
-  Status status = Status::OK();
-
-  int num_batches = 1;
-  int rows_added = 0;
-  BufferedTupleStreamV2 stream(
-      runtime_state_, string_desc_, &client_, buffer_size, buffer_size);
-  ASSERT_OK(stream.Init(-1, false));
-  bool got_write_reservation;
-  ASSERT_OK(stream.PrepareForWrite(&got_write_reservation));
-  ASSERT_TRUE(got_write_reservation);
-
-  for (int i = 0; i < num_batches; ++i) {
-    RowBatch* batch = CreateStringBatch(rows_added, 1, false);
-    for (int j = 0; j < batch->num_rows(); ++j) {
-      TupleRow* row = batch->GetRow(j);
-      int64_t fixed_size = 0;
-      int64_t varlen_size = 0;
-      for (int k = 0; k < string_desc_->tuple_descriptors().size(); k++) {
-        TupleDescriptor* tuple_desc = string_desc_->tuple_descriptors()[k];
-        fixed_size += tuple_desc->byte_size();
-        varlen_size += row->GetTuple(k)->VarlenByteSize(*tuple_desc);
-      }
-      uint8_t* data = stream.AddRowCustomBegin(fixed_size + varlen_size, &status);
-      ASSERT_TRUE(data != nullptr);
-      ASSERT_TRUE(status.ok());
-      WriteStringRow(string_desc_, row, fixed_size, varlen_size, data);
-      stream.AddRowCustomEnd(fixed_size + varlen_size);
-    }
-    rows_added += batch->num_rows();
-  }
-
-  for (int i = 0; i < 3; ++i) {
-    bool delete_on_read = i == 2;
-    vector<StringValue> results;
-    bool got_read_reservation;
-    ASSERT_OK(stream.PrepareForRead(delete_on_read, &got_read_reservation));
-    ASSERT_TRUE(got_read_reservation);
-    ReadValues(&stream, string_desc_, &results);
-    VerifyResults<StringValue>(*string_desc_, results, rows_added, false);
-  }
-
-  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-void SimpleTupleStreamTest::WriteStringRow(const RowDescriptor* row_desc, TupleRow* row,
-    int64_t fixed_size, int64_t varlen_size, uint8_t* data) {
-  uint8_t* fixed_data = data;
-  uint8_t* varlen_write_ptr = data + fixed_size;
-  for (int i = 0; i < row_desc->tuple_descriptors().size(); i++) {
-    TupleDescriptor* tuple_desc = row_desc->tuple_descriptors()[i];
-    Tuple* src = row->GetTuple(i);
-    Tuple* dst = reinterpret_cast<Tuple*>(fixed_data);
-    fixed_data += tuple_desc->byte_size();
-    memcpy(dst, src, tuple_desc->byte_size());
-    for (SlotDescriptor* slot : tuple_desc->slots()) {
-      StringValue* src_string = src->GetStringSlot(slot->tuple_offset());
-      StringValue* dst_string = dst->GetStringSlot(slot->tuple_offset());
-      dst_string->ptr = reinterpret_cast<char*>(varlen_write_ptr);
-      memcpy(dst_string->ptr, src_string->ptr, src_string->len);
-      varlen_write_ptr += src_string->len;
-    }
-  }
-  ASSERT_EQ(data + fixed_size + varlen_size, varlen_write_ptr);
-}
-
-// Test with rows with multiple nullable tuples.
-TEST_F(MultiNullableTupleStreamTest, MultiNullableTupleOneBufferSpill) {
-  // Each buffer can only hold 128 ints, so this spills quite often.
-  int buffer_size = 128 * sizeof(int);
-  Init(buffer_size);
-  TestValues<int>(0, int_desc_, false, true, buffer_size);
-  TestValues<int>(1, int_desc_, false, true, buffer_size);
-  TestValues<int>(10, int_desc_, false, true, buffer_size);
-  TestValues<int>(0, int_desc_, true, true, buffer_size);
-  TestValues<int>(1, int_desc_, true, true, buffer_size);
-  TestValues<int>(10, int_desc_, true, true, buffer_size);
-
-  TestValues<StringValue>(0, string_desc_, false, true, buffer_size);
-  TestValues<StringValue>(1, string_desc_, false, true, buffer_size);
-  TestValues<StringValue>(10, string_desc_, false, true, buffer_size);
-  TestValues<StringValue>(0, string_desc_, true, true, buffer_size);
-  TestValues<StringValue>(1, string_desc_, true, true, buffer_size);
-  TestValues<StringValue>(10, string_desc_, true, true, buffer_size);
-}
-
-// Test with a few buffers.
-TEST_F(MultiNullableTupleStreamTest, MultiNullableTupleManyBufferSpill) {
-  int buffer_size = 128 * sizeof(int);
-  Init(10 * buffer_size);
-
-  TestValues<int>(0, int_desc_, false, true, buffer_size);
-  TestValues<int>(1, int_desc_, false, true, buffer_size);
-  TestValues<int>(10, int_desc_, false, true, buffer_size);
-  TestValues<int>(100, int_desc_, false, true, buffer_size);
-  TestValues<int>(0, int_desc_, true, true, buffer_size);
-  TestValues<int>(1, int_desc_, true, true, buffer_size);
-  TestValues<int>(10, int_desc_, true, true, buffer_size);
-  TestValues<int>(100, int_desc_, true, true, buffer_size);
-
-  TestValues<StringValue>(0, string_desc_, false, true, buffer_size);
-  TestValues<StringValue>(1, string_desc_, false, true, buffer_size);
-  TestValues<StringValue>(10, string_desc_, false, true, buffer_size);
-  TestValues<StringValue>(100, string_desc_, false, true, buffer_size);
-  TestValues<StringValue>(0, string_desc_, true, true, buffer_size);
-  TestValues<StringValue>(1, string_desc_, true, true, buffer_size);
-  TestValues<StringValue>(10, string_desc_, true, true, buffer_size);
-  TestValues<StringValue>(100, string_desc_, true, true, buffer_size);
-
-  TestIntValuesInterleaved(0, 1, true, buffer_size);
-  TestIntValuesInterleaved(1, 1, true, buffer_size);
-  TestIntValuesInterleaved(10, 5, true, buffer_size);
-  TestIntValuesInterleaved(100, 15, true, buffer_size);
-}
-
-/// Test that ComputeRowSize handles nulls
-TEST_F(MultiNullableTupleStreamTest, TestComputeRowSize) {
-  Init(BUFFER_POOL_LIMIT);
-  const vector<TupleDescriptor*>& tuple_descs = string_desc_->tuple_descriptors();
-  // String in second tuple is stored externally.
-  set<SlotId> external_slots;
-  const SlotDescriptor* external_string_slot = tuple_descs[1]->slots()[0];
-  external_slots.insert(external_string_slot->id());
-
-  BufferedTupleStreamV2 stream(
-      runtime_state_, string_desc_, &client_, PAGE_LEN, PAGE_LEN, external_slots);
-  gscoped_ptr<TupleRow, FreeDeleter> row(
-      reinterpret_cast<TupleRow*>(malloc(tuple_descs.size() * sizeof(Tuple*))));
-  gscoped_ptr<Tuple, FreeDeleter> tuple0(
-      reinterpret_cast<Tuple*>(malloc(tuple_descs[0]->byte_size())));
-  gscoped_ptr<Tuple, FreeDeleter> tuple1(
-      reinterpret_cast<Tuple*>(malloc(tuple_descs[1]->byte_size())));
-  gscoped_ptr<Tuple, FreeDeleter> tuple2(
-      reinterpret_cast<Tuple*>(malloc(tuple_descs[2]->byte_size())));
-  memset(tuple0.get(), 0, tuple_descs[0]->byte_size());
-  memset(tuple1.get(), 0, tuple_descs[1]->byte_size());
-  memset(tuple2.get(), 0, tuple_descs[2]->byte_size());
-  const int tuple_null_indicator_bytes = 1; // Need 1 bytes for 3 tuples.
-
-  // All nullable tuples are NULL.
-  row->SetTuple(0, tuple0.get());
-  row->SetTuple(1, nullptr);
-  row->SetTuple(2, nullptr);
-  EXPECT_EQ(tuple_null_indicator_bytes + tuple_descs[0]->byte_size(),
-      stream.ComputeRowSize(row.get()));
-
-  // Tuples are initialized to empty and have no var-len data.
-  row->SetTuple(1, tuple1.get());
-  row->SetTuple(2, tuple2.get());
-  EXPECT_EQ(tuple_null_indicator_bytes + string_desc_->GetRowSize(),
-      stream.ComputeRowSize(row.get()));
-
-  // Tuple 0 has some data.
-  const SlotDescriptor* string_slot = tuple_descs[0]->slots()[0];
-  StringValue* sv = tuple0->GetStringSlot(string_slot->tuple_offset());
-  *sv = STRINGS[0];
-  int64_t expected_len =
-      tuple_null_indicator_bytes + string_desc_->GetRowSize() + sv->len;
-  EXPECT_EQ(expected_len, stream.ComputeRowSize(row.get()));
-
-  // Check that external slots aren't included in count.
-  sv = tuple1->GetStringSlot(external_string_slot->tuple_offset());
-  sv->ptr = reinterpret_cast<char*>(1234);
-  sv->len = 1234;
-  EXPECT_EQ(expected_len, stream.ComputeRowSize(row.get()));
-
-  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-/// Test that deep copy works with arrays by copying into a BufferedTupleStream, freeing
-/// the original rows, then reading back the rows and verifying the contents.
-TEST_F(ArrayTupleStreamTest, TestArrayDeepCopy) {
-  Status status;
-  Init(BUFFER_POOL_LIMIT);
-  const int NUM_ROWS = 4000;
-  BufferedTupleStreamV2 stream(runtime_state_, array_desc_, &client_, PAGE_LEN, PAGE_LEN);
-  const vector<TupleDescriptor*>& tuple_descs = array_desc_->tuple_descriptors();
-  // Write out a predictable pattern of data by iterating over arrays of constants.
-  int strings_index = 0; // we take the mod of this as index into STRINGS.
-  int array_lens[] = {0, 1, 5, 10, 1000, 2, 49, 20};
-  int num_array_lens = sizeof(array_lens) / sizeof(array_lens[0]);
-  int array_len_index = 0;
-  ASSERT_OK(stream.Init(-1, false));
-  bool got_write_reservation;
-  ASSERT_OK(stream.PrepareForWrite(&got_write_reservation));
-  ASSERT_TRUE(got_write_reservation);
-
-  for (int i = 0; i < NUM_ROWS; ++i) {
-    const int tuple_null_indicator_bytes = 1; // Need 1 bytes for 2 tuples.
-    int expected_row_size = tuple_null_indicator_bytes + tuple_descs[0]->byte_size()
-        + tuple_descs[1]->byte_size();
-    gscoped_ptr<TupleRow, FreeDeleter> row(
-        reinterpret_cast<TupleRow*>(malloc(tuple_descs.size() * sizeof(Tuple*))));
-    gscoped_ptr<Tuple, FreeDeleter> tuple0(
-        reinterpret_cast<Tuple*>(malloc(tuple_descs[0]->byte_size())));
-    gscoped_ptr<Tuple, FreeDeleter> tuple1(
-        reinterpret_cast<Tuple*>(malloc(tuple_descs[1]->byte_size())));
-    memset(tuple0.get(), 0, tuple_descs[0]->byte_size());
-    memset(tuple1.get(), 0, tuple_descs[1]->byte_size());
-    row->SetTuple(0, tuple0.get());
-    row->SetTuple(1, tuple1.get());
-
-    // Only array<string> is non-null.
-    tuple0->SetNull(tuple_descs[0]->slots()[1]->null_indicator_offset());
-    tuple1->SetNull(tuple_descs[1]->slots()[0]->null_indicator_offset());
-    const SlotDescriptor* array_slot_desc = tuple_descs[0]->slots()[0];
-    const TupleDescriptor* item_desc = array_slot_desc->collection_item_descriptor();
-
-    int array_len = array_lens[array_len_index++ % num_array_lens];
-    CollectionValue* cv = tuple0->GetCollectionSlot(array_slot_desc->tuple_offset());
-    cv->ptr = nullptr;
-    cv->num_tuples = 0;
-    CollectionValueBuilder builder(
-        cv, *item_desc, mem_pool_.get(), runtime_state_, array_len);
-    Tuple* array_data;
-    int num_rows;
-    builder.GetFreeMemory(&array_data, &num_rows);
-    expected_row_size += item_desc->byte_size() * array_len;
-
-    // Fill the array with pointers to our constant strings.
-    for (int j = 0; j < array_len; ++j) {
-      const StringValue* string = &STRINGS[strings_index++ % NUM_STRINGS];
-      array_data->SetNotNull(item_desc->slots()[0]->null_indicator_offset());
-      RawValue::Write(string, array_data, item_desc->slots()[0], mem_pool_.get());
-      array_data += item_desc->byte_size();
-      expected_row_size += string->len;
-    }
-    builder.CommitTuples(array_len);
-
-    // Check that internal row size computation gives correct result.
-    EXPECT_EQ(expected_row_size, stream.ComputeRowSize(row.get()));
-    bool b = stream.AddRow(row.get(), &status);
-    ASSERT_TRUE(b);
-    ASSERT_OK(status);
-    mem_pool_->FreeAll(); // Free data as soon as possible to smoke out issues.
-  }
-
-  // Read back and verify data.
-  bool got_read_reservation;
-  ASSERT_OK(stream.PrepareForRead(false, &got_read_reservation));
-  ASSERT_TRUE(got_read_reservation);
-  strings_index = 0;
-  array_len_index = 0;
-  bool eos = false;
-  int rows_read = 0;
-  RowBatch batch(array_desc_, BATCH_SIZE, &tracker_);
-  do {
-    batch.Reset();
-    ASSERT_OK(stream.GetNext(&batch, &eos));
-    for (int i = 0; i < batch.num_rows(); ++i) {
-      TupleRow* row = batch.GetRow(i);
-      Tuple* tuple0 = row->GetTuple(0);
-      Tuple* tuple1 = row->GetTuple(1);
-      ASSERT_TRUE(tuple0 != nullptr);
-      ASSERT_TRUE(tuple1 != nullptr);
-      const SlotDescriptor* array_slot_desc = tuple_descs[0]->slots()[0];
-      ASSERT_FALSE(tuple0->IsNull(array_slot_desc->null_indicator_offset()));
-      ASSERT_TRUE(tuple0->IsNull(tuple_descs[0]->slots()[1]->null_indicator_offset()));
-      ASSERT_TRUE(tuple1->IsNull(tuple_descs[1]->slots()[0]->null_indicator_offset()));
-
-      const TupleDescriptor* item_desc = array_slot_desc->collection_item_descriptor();
-      int expected_array_len = array_lens[array_len_index++ % num_array_lens];
-      CollectionValue* cv = tuple0->GetCollectionSlot(array_slot_desc->tuple_offset());
-      ASSERT_EQ(expected_array_len, cv->num_tuples);
-      for (int j = 0; j < cv->num_tuples; ++j) {
-        Tuple* item = reinterpret_cast<Tuple*>(cv->ptr + j * item_desc->byte_size());
-        const SlotDescriptor* string_desc = item_desc->slots()[0];
-        ASSERT_FALSE(item->IsNull(string_desc->null_indicator_offset()));
-        const StringValue* expected = &STRINGS[strings_index++ % NUM_STRINGS];
-        const StringValue* actual = item->GetStringSlot(string_desc->tuple_offset());
-        ASSERT_EQ(*expected, *actual);
-      }
-    }
-    rows_read += batch.num_rows();
-  } while (!eos);
-  ASSERT_EQ(NUM_ROWS, rows_read);
-  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-/// Test that ComputeRowSize handles nulls
-TEST_F(ArrayTupleStreamTest, TestComputeRowSize) {
-  Init(BUFFER_POOL_LIMIT);
-  const vector<TupleDescriptor*>& tuple_descs = array_desc_->tuple_descriptors();
-  set<SlotId> external_slots;
-  // Second array slot in first tuple is stored externally.
-  const SlotDescriptor* external_array_slot = tuple_descs[0]->slots()[1];
-  external_slots.insert(external_array_slot->id());
-
-  BufferedTupleStreamV2 stream(
-      runtime_state_, array_desc_, &client_, PAGE_LEN, PAGE_LEN, external_slots);
-  gscoped_ptr<TupleRow, FreeDeleter> row(
-      reinterpret_cast<TupleRow*>(malloc(tuple_descs.size() * sizeof(Tuple*))));
-  gscoped_ptr<Tuple, FreeDeleter> tuple0(
-      reinterpret_cast<Tuple*>(malloc(tuple_descs[0]->byte_size())));
-  gscoped_ptr<Tuple, FreeDeleter> tuple1(
-      reinterpret_cast<Tuple*>(malloc(tuple_descs[1]->byte_size())));
-  memset(tuple0.get(), 0, tuple_descs[0]->byte_size());
-  memset(tuple1.get(), 0, tuple_descs[1]->byte_size());
-
-  const int tuple_null_indicator_bytes = 1; // Need 1 bytes for 3 tuples.
-
-  // All tuples are NULL - only need null indicators.
-  row->SetTuple(0, nullptr);
-  row->SetTuple(1, nullptr);
-  EXPECT_EQ(tuple_null_indicator_bytes, stream.ComputeRowSize(row.get()));
-
-  // Tuples are initialized to empty and have no var-len data.
-  row->SetTuple(0, tuple0.get());
-  row->SetTuple(1, tuple1.get());
-  EXPECT_EQ(tuple_null_indicator_bytes + array_desc_->GetRowSize(),
-      stream.ComputeRowSize(row.get()));
-
-  // Tuple 0 has an array.
-  int expected_row_size = tuple_null_indicator_bytes + array_desc_->GetRowSize();
-  const SlotDescriptor* array_slot = tuple_descs[0]->slots()[0];
-  const TupleDescriptor* item_desc = array_slot->collection_item_descriptor();
-  int array_len = 128;
-  CollectionValue* cv = tuple0->GetCollectionSlot(array_slot->tuple_offset());
-  CollectionValueBuilder builder(
-      cv, *item_desc, mem_pool_.get(), runtime_state_, array_len);
-  Tuple* array_data;
-  int num_rows;
-  builder.GetFreeMemory(&array_data, &num_rows);
-  expected_row_size += item_desc->byte_size() * array_len;
-
-  // Fill the array with pointers to our constant strings.
-  for (int i = 0; i < array_len; ++i) {
-    const StringValue* str = &STRINGS[i % NUM_STRINGS];
-    array_data->SetNotNull(item_desc->slots()[0]->null_indicator_offset());
-    RawValue::Write(str, array_data, item_desc->slots()[0], mem_pool_.get());
-    array_data += item_desc->byte_size();
-    expected_row_size += str->len;
-  }
-  builder.CommitTuples(array_len);
-  EXPECT_EQ(expected_row_size, stream.ComputeRowSize(row.get()));
-
-  // Check that the external slot isn't included in size.
-  cv = tuple0->GetCollectionSlot(external_array_slot->tuple_offset());
-  // ptr of external slot shouldn't be dereferenced when computing size.
-  cv->ptr = reinterpret_cast<uint8_t*>(1234);
-  cv->num_tuples = 1234;
-  EXPECT_EQ(expected_row_size, stream.ComputeRowSize(row.get()));
-
-  // Check that the array is excluded if tuple 0's array has its null indicator set.
-  tuple0->SetNull(array_slot->null_indicator_offset());
-  EXPECT_EQ(tuple_null_indicator_bytes + array_desc_->GetRowSize(),
-      stream.ComputeRowSize(row.get()));
-
-  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-}
-
-int main(int argc, char** argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-  impala::InitCommonRuntime(argc, argv, true, impala::TestInfo::BE_TEST);
-  impala::InitFeSupport();
-  impala::LlvmCodeGen::InitializeLlvm();
-  return RUN_ALL_TESTS();
-}


[03/11] incubator-impala git commit: IMPALA-5774: Prevent FindInSet() from reading off end of string

Posted by ta...@apache.org.
IMPALA-5774: Prevent FindInSet() from reading off end of string

Change-Id: I541c8e6bb712e380f9610d6bfa35e2d515a31d1d
Reviewed-on: http://gerrit.cloudera.org:8080/7608
Reviewed-by: Sailesh Mukil <sa...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/5caadbbe
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/5caadbbe
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/5caadbbe

Branch: refs/heads/master
Commit: 5caadbbedd1917019937290e9427fd6f798f0cd8
Parents: b881fba
Author: Henry Robinson <he...@cloudera.com>
Authored: Mon Aug 7 12:35:54 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Mon Aug 7 23:41:45 2017 +0000

----------------------------------------------------------------------
 be/src/exprs/string-functions-ir.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5caadbbe/be/src/exprs/string-functions-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/string-functions-ir.cc b/be/src/exprs/string-functions-ir.cc
index 5c66541..d0c8679 100644
--- a/be/src/exprs/string-functions-ir.cc
+++ b/be/src/exprs/string-functions-ir.cc
@@ -814,7 +814,7 @@ IntVal StringFunctions::FindInSet(FunctionContext* context, const StringVal& str
   do {
     end = start;
     // Position end.
-    while(str_set.ptr[end] != ',' && end < str_set.len) ++end;
+    while (end < str_set.len && str_set.ptr[end] != ',') ++end;
     StringValue token(reinterpret_cast<char*>(str_set.ptr) + start, end - start);
     if (str_sv.Eq(token)) return IntVal(token_index);
 


[10/11] incubator-impala git commit: IMPALA-5696: Enable cipher configuration when using TLS / Thrift

Posted by ta...@apache.org.
IMPALA-5696: Enable cipher configuration when using TLS / Thrift

The 'cipher suite' is a description of the set of algorithms used by SSL
and TLS to execute key exchange, encryption, message authentication, and
random number generation functions. SSL implementations allow the cipher
suite to be configured so that ciphers may be removed from the whitelist
if they are shown to be weak.

* Add a flag --ssl_cipher_list which controls cipher selection for both
  thrift servers and clients. Default is blank, which means use all
  available cipher suites.
* Add ThriftServerBuilder to simplify construction of
  ThriftServers (whose constructors were otherwise getting very long).

Testing: new tests added to thrift-server-test. The added test cases are:

* A client cannot connect to a server which does not have any ciphers in
  common with it.
* If ciphers are identical on clients and servers, SSL connections can
  be made.
* Bad cipher strings lead to errors on both client and server.

Change-Id: I735ae36eebfdf7228f235686c9c69642c3c9d84f
Reviewed-on: http://gerrit.cloudera.org:8080/7524
Reviewed-by: Henry Robinson <he...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/68df21b4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/68df21b4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/68df21b4

Branch: refs/heads/master
Commit: 68df21b426feca8e7a458152d8dca1b7e1335bcb
Parents: d61065d
Author: Henry Robinson <he...@cloudera.com>
Authored: Fri Jul 21 14:46:13 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Aug 8 10:33:31 2017 +0000

----------------------------------------------------------------------
 be/src/benchmarks/network-perf-benchmark.cc |   6 +-
 be/src/catalog/catalogd-main.cc             |  12 +-
 be/src/rpc/thrift-client.cc                 |   2 +
 be/src/rpc/thrift-server-test.cc            | 217 ++++++++++++++++++-----
 be/src/rpc/thrift-server.cc                 |   5 +-
 be/src/rpc/thrift-server.h                  | 149 +++++++++++++---
 be/src/runtime/data-stream-test.cc          |   2 +-
 be/src/service/impala-server.cc             |  52 ++++--
 be/src/statestore/statestore-subscriber.cc  |  16 +-
 be/src/statestore/statestored-main.cc       |  11 +-
 be/src/testutil/in-process-servers.cc       |   9 +-
 be/src/testutil/scoped-flag-setter.h        |  52 ++++++
 be/src/util/webserver-test.cc               |  50 ++----
 13 files changed, 440 insertions(+), 143 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/68df21b4/be/src/benchmarks/network-perf-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/network-perf-benchmark.cc b/be/src/benchmarks/network-perf-benchmark.cc
index 26b2aae..6cebaaf 100644
--- a/be/src/benchmarks/network-perf-benchmark.cc
+++ b/be/src/benchmarks/network-perf-benchmark.cc
@@ -221,8 +221,10 @@ int main(int argc, char** argv) {
   boost::shared_ptr<ThreadFactory> thread_factory(
       new ThriftThreadFactory("test", "test"));
   boost::shared_ptr<TProcessor> processor(new NetworkTestServiceProcessor(handler));
-  ThriftServer* server = new ThriftServer("Network Test Server", processor,
-      FLAGS_port, NULL, NULL, 100, ThriftServer::ThreadPool);
+  ThriftServer* server;
+  ABORT_IF_ERROR(ThriftServerBuilder("Network Test Server", processor, FLAGS_port)
+                     .thread_pool(100)
+                     .Build(&server));
   thread* server_thread = new thread(&TestServer::Server, handler.get(), server);
 
   string input;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/68df21b4/be/src/catalog/catalogd-main.cc
----------------------------------------------------------------------
diff --git a/be/src/catalog/catalogd-main.cc b/be/src/catalog/catalogd-main.cc
index ae1cdf6..4847920 100644
--- a/be/src/catalog/catalogd-main.cc
+++ b/be/src/catalog/catalogd-main.cc
@@ -47,6 +47,7 @@ DECLARE_int32(state_store_subscriber_port);
 DECLARE_string(ssl_server_certificate);
 DECLARE_string(ssl_private_key);
 DECLARE_string(ssl_private_key_password_cmd);
+DECLARE_string(ssl_cipher_list);
 
 #include "common/names.h"
 
@@ -89,13 +90,16 @@ int CatalogdMain(int argc, char** argv) {
       new RpcEventHandler("catalog-server", metrics.get()));
   processor->setEventHandler(event_handler);
 
-  ThriftServer* server = new ThriftServer("CatalogService", processor,
-      FLAGS_catalog_service_port, NULL, metrics.get(), 5);
+  ThriftServer* server;
+  ThriftServerBuilder builder("CatalogService", processor, FLAGS_catalog_service_port);
+
   if (EnableInternalSslConnections()) {
     LOG(INFO) << "Enabling SSL for CatalogService";
-    ABORT_IF_ERROR(server->EnableSsl(FLAGS_ssl_server_certificate, FLAGS_ssl_private_key,
-        FLAGS_ssl_private_key_password_cmd));
+    builder.ssl(FLAGS_ssl_server_certificate, FLAGS_ssl_private_key)
+        .pem_password_cmd(FLAGS_ssl_private_key_password_cmd)
+        .cipher_list(FLAGS_ssl_cipher_list);
   }
+  ABORT_IF_ERROR(builder.metrics(metrics.get()).Build(&server));
   ABORT_IF_ERROR(server->Start());
   LOG(INFO) << "CatalogService started on port: " << FLAGS_catalog_service_port;
   server->Join();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/68df21b4/be/src/rpc/thrift-client.cc
----------------------------------------------------------------------
diff --git a/be/src/rpc/thrift-client.cc b/be/src/rpc/thrift-client.cc
index e0b9a6f..6f01073 100644
--- a/be/src/rpc/thrift-client.cc
+++ b/be/src/rpc/thrift-client.cc
@@ -32,6 +32,7 @@ using namespace apache::thrift;
 using namespace strings;
 
 DECLARE_string(ssl_client_ca_certificate);
+DECLARE_string(ssl_cipher_list);
 
 namespace impala {
 
@@ -100,6 +101,7 @@ Status ThriftClientImpl::CreateSocket() {
     socket_.reset(new TSocket(address_.hostname, address_.port));
   } else {
     try {
+      if (!FLAGS_ssl_cipher_list.empty()) ssl_factory_->ciphers(FLAGS_ssl_cipher_list);
       ssl_factory_->loadTrustedCertificates(FLAGS_ssl_client_ca_certificate.c_str());
       socket_ = ssl_factory_->createSocket(address_.hostname, address_.port);
     } catch (const TException& e) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/68df21b4/be/src/rpc/thrift-server-test.cc
----------------------------------------------------------------------
diff --git a/be/src/rpc/thrift-server-test.cc b/be/src/rpc/thrift-server-test.cc
index f6edfb2..f7a2916 100644
--- a/be/src/rpc/thrift-server-test.cc
+++ b/be/src/rpc/thrift-server-test.cc
@@ -17,12 +17,13 @@
 
 #include <string>
 
-#include "testutil/gtest-util.h"
+#include "gen-cpp/StatestoreService.h"
+#include "gutil/strings/substitute.h"
 #include "rpc/thrift-client.h"
 #include "service/fe-support.h"
 #include "service/impala-server.h"
-#include "gen-cpp/StatestoreService.h"
-#include "gutil/strings/substitute.h"
+#include "testutil/gtest-util.h"
+#include "testutil/scoped-flag-setter.h"
 
 #include "common/names.h"
 
@@ -31,6 +32,7 @@ using namespace strings;
 using namespace apache::thrift;
 
 DECLARE_string(ssl_client_ca_certificate);
+DECLARE_string(ssl_cipher_list);
 
 DECLARE_int32(state_store_port);
 
@@ -71,12 +73,12 @@ int GetServerPort() {
 
 TEST(ThriftServer, Connectivity) {
   int port = GetServerPort();
-  ThriftClient<StatestoreServiceClientWrapper> wrong_port_client("localhost",
-      port, "", NULL, false);
+  ThriftClient<StatestoreServiceClientWrapper> wrong_port_client(
+      "localhost", port, "", nullptr, false);
   ASSERT_FALSE(wrong_port_client.Open().ok());
 
-  ThriftServer* server =
-      new ThriftServer("DummyStatestore", MakeProcessor(), port, NULL, NULL, 5);
+  ThriftServer* server;
+  EXPECT_OK(ThriftServerBuilder("DummyStatestore", MakeProcessor(), port).Build(&server));
   ASSERT_OK(server->Start());
 
   // Test that client recovers from failure to connect.
@@ -89,14 +91,15 @@ TEST(SslTest, Connectivity) {
   // client cannot.
   // Here and elsewhere - allocate ThriftServers on the heap to avoid race during
   // destruction. See IMPALA-2283.
-  ThriftServer* server = new ThriftServer("DummyStatestore", MakeProcessor(), port, NULL,
-      NULL, 5);
-  ASSERT_OK(server->EnableSsl(SERVER_CERT, PRIVATE_KEY, "echo password"));
+  ThriftServer* server;
+  EXPECT_OK(ThriftServerBuilder("DummyStatestore", MakeProcessor(), port)
+                .ssl(SERVER_CERT, PRIVATE_KEY)
+                .Build(&server));
   ASSERT_OK(server->Start());
 
   FLAGS_ssl_client_ca_certificate = SERVER_CERT;
   ThriftClient<StatestoreServiceClientWrapper> ssl_client(
-      "localhost", port, "", NULL, true);
+      "localhost", port, "", nullptr, true);
   ASSERT_OK(ssl_client.Open());
   TRegisterSubscriberResponse resp;
   bool send_done = false;
@@ -106,7 +109,7 @@ TEST(SslTest, Connectivity) {
 
   // Disable SSL for this client.
   ThriftClient<StatestoreServiceClientWrapper> non_ssl_client(
-      "localhost", port, "", NULL, false);
+      "localhost", port, "", nullptr, false);
   ASSERT_OK(non_ssl_client.Open());
   send_done = false;
   EXPECT_THROW(non_ssl_client.iface()->RegisterSubscriber(
@@ -116,13 +119,14 @@ TEST(SslTest, Connectivity) {
 TEST(SslTest, BadCertificate) {
   FLAGS_ssl_client_ca_certificate = "unknown";
   int port = GetServerPort();
-  ThriftClient<StatestoreServiceClientWrapper>
-      ssl_client("localhost", port, "", NULL, true);
+  ThriftClient<StatestoreServiceClientWrapper> ssl_client(
+      "localhost", port, "", nullptr, true);
   ASSERT_FALSE(ssl_client.Open().ok());
 
-  ThriftServer* server =
-      new ThriftServer("DummyStatestore", MakeProcessor(), port, NULL, NULL, 5);
-  ASSERT_OK(server->EnableSsl(SERVER_CERT, PRIVATE_KEY, "echo password"));
+  ThriftServer* server;
+  EXPECT_OK(ThriftServerBuilder("DummyStatestore", MakeProcessor(), port)
+                .ssl(SERVER_CERT, PRIVATE_KEY)
+                .Build(&server));
   ASSERT_OK(server->Start());
 
   // Check that client does not recover from failure to create socket.
@@ -133,14 +137,16 @@ TEST(PasswordProtectedPemFile, CorrectOperation) {
   // Require the server to execute a shell command to read the password to the private key
   // file.
   int port = GetServerPort();
-  ThriftServer* server = new ThriftServer("DummyStatestore", MakeProcessor(), port, NULL,
-      NULL, 5);
-  ASSERT_OK(server->EnableSsl(
-      SERVER_CERT, PASSWORD_PROTECTED_PRIVATE_KEY, "echo password"));
+  ThriftServer* server;
+  EXPECT_OK(ThriftServerBuilder("DummyStatestore", MakeProcessor(), port)
+                .ssl(SERVER_CERT, PASSWORD_PROTECTED_PRIVATE_KEY)
+                .pem_password_cmd("echo password")
+                .Build(&server));
   ASSERT_OK(server->Start());
-  FLAGS_ssl_client_ca_certificate = SERVER_CERT;
-  ThriftClient<StatestoreServiceClientWrapper>
-      ssl_client("localhost", port, "", NULL, true);
+
+  auto s = ScopedFlagSetter<string>::Make(&FLAGS_ssl_client_ca_certificate, SERVER_CERT);
+  ThriftClient<StatestoreServiceClientWrapper> ssl_client(
+      "localhost", port, "", nullptr, true);
   ASSERT_OK(ssl_client.Open());
   TRegisterSubscriberResponse resp;
   bool send_done = false;
@@ -150,28 +156,38 @@ TEST(PasswordProtectedPemFile, CorrectOperation) {
 
 TEST(PasswordProtectedPemFile, BadPassword) {
   // Test failure when password to private key is wrong.
-  ThriftServer server("DummyStatestore", MakeProcessor(), GetServerPort(), NULL, NULL, 5);
-  ASSERT_OK(server.EnableSsl(
-      SERVER_CERT, PASSWORD_PROTECTED_PRIVATE_KEY, "echo wrongpassword"));
-  EXPECT_FALSE(server.Start().ok());
+  ThriftServer* server;
+  EXPECT_OK(ThriftServerBuilder("DummyStatestore", MakeProcessor(), GetServerPort())
+                .ssl(SERVER_CERT, PASSWORD_PROTECTED_PRIVATE_KEY)
+                .pem_password_cmd("echo wrongpassword")
+                .Build(&server));
+  EXPECT_FALSE(server->Start().ok());
 }
 
 TEST(PasswordProtectedPemFile, BadCommand) {
   // Test failure when password command is badly formed.
-  ThriftServer server("DummyStatestore", MakeProcessor(), GetServerPort(), NULL, NULL, 5);
-  EXPECT_FALSE(server.EnableSsl(
-      SERVER_CERT, PASSWORD_PROTECTED_PRIVATE_KEY, "cmd-no-exist").ok());
+  ThriftServer* server;
+
+  // Keep clang-tidy happy - NOLINT (which here is due to deliberately leaked 'server')
+  // does not get pushed into EXPECT_ERROR.
+  Status s = ThriftServerBuilder("DummyStatestore", MakeProcessor(), GetServerPort()) // NOLINT
+      .ssl(SERVER_CERT, PASSWORD_PROTECTED_PRIVATE_KEY)
+      .pem_password_cmd("cmd-no-exist")
+      .Build(&server);
+  EXPECT_ERROR(s, TErrorCode::SSL_PASSWORD_CMD_FAILED);
 }
 
 TEST(SslTest, ClientBeforeServer) {
   // Instantiate a thrift client before a thrift server and test if it works (IMPALA-2747)
-  FLAGS_ssl_client_ca_certificate = SERVER_CERT;
+  auto s = ScopedFlagSetter<string>::Make(&FLAGS_ssl_client_ca_certificate, SERVER_CERT);
   int port = GetServerPort();
-  ThriftClient<StatestoreServiceClientWrapper>
-      ssl_client("localhost", port, "", NULL, true);
-  ThriftServer* server =
-      new ThriftServer("DummyStatestore", MakeProcessor(), port, NULL, NULL, 5);
-  ASSERT_OK(server->EnableSsl(SERVER_CERT, PRIVATE_KEY));
+  ThriftClient<StatestoreServiceClientWrapper> ssl_client(
+      "localhost", port, "", nullptr, true);
+
+  ThriftServer* server;
+  EXPECT_OK(ThriftServerBuilder("DummyStatestore", MakeProcessor(), port)
+                .ssl(SERVER_CERT, PRIVATE_KEY)
+                .Build(&server));
   ASSERT_OK(server->Start());
 
   ASSERT_OK(ssl_client.Open());
@@ -180,6 +196,113 @@ TEST(SslTest, ClientBeforeServer) {
   ssl_client.iface()->RegisterSubscriber(resp, TRegisterSubscriberRequest(), &send_done);
 }
 
+TEST(SslTest, BadCiphers) {
+  int port = GetServerPort();
+  {
+    ThriftServer* server;
+    EXPECT_OK(ThriftServerBuilder("DummyStatestore", MakeProcessor(), port)
+                  .ssl(SERVER_CERT, PRIVATE_KEY)
+                  .cipher_list("this_is_not_a_cipher")
+                  .Build(&server));
+    EXPECT_FALSE(server->Start().ok());
+  }
+
+  {
+    ThriftServer* server;
+    EXPECT_OK(ThriftServerBuilder("DummyStatestore", MakeProcessor(), port)
+                  .ssl(SERVER_CERT, PRIVATE_KEY)
+                  .Build(&server));
+    EXPECT_OK(server->Start());
+
+    auto s1 =
+        ScopedFlagSetter<string>::Make(&FLAGS_ssl_cipher_list, "this_is_not_a_cipher");
+    auto s2 =
+        ScopedFlagSetter<string>::Make(&FLAGS_ssl_client_ca_certificate, SERVER_CERT);
+
+    ThriftClient<StatestoreServiceClientWrapper> ssl_client(
+        "localhost", port, "", nullptr, true);
+    EXPECT_FALSE(ssl_client.Open().ok());
+  }
+}
+
+TEST(SslTest, MismatchedCiphers) {
+  int port = GetServerPort();
+  FLAGS_ssl_client_ca_certificate = SERVER_CERT;
+
+  ThriftServer* server;
+  EXPECT_OK(ThriftServerBuilder("DummyStatestore", MakeProcessor(), port)
+                .ssl(SERVER_CERT, PASSWORD_PROTECTED_PRIVATE_KEY)
+                .pem_password_cmd("echo password")
+                .cipher_list("AES256-SHA256")
+                .Build(&server));
+  EXPECT_OK(server->Start());
+
+  auto s = ScopedFlagSetter<string>::Make(&FLAGS_ssl_cipher_list, "RC4-SHA");
+  ThriftClient<StatestoreServiceClientWrapper> ssl_client(
+      "localhost", port, "", nullptr, true);
+
+  // Failure to negotiate a cipher will show up when data is sent, not when socket is
+  // opened.
+  EXPECT_OK(ssl_client.Open());
+
+  bool send_done = false;
+  TRegisterSubscriberResponse resp;
+  EXPECT_THROW(ssl_client.iface()->RegisterSubscriber(
+                   resp, TRegisterSubscriberRequest(), &send_done),
+      TTransportException);
+}
+
+TEST(SslTest, MatchedCiphers) {
+  int port = GetServerPort();
+  ThriftServer* server;
+  EXPECT_OK(ThriftServerBuilder("DummyStatestore", MakeProcessor(), port)
+                .ssl(SERVER_CERT, PASSWORD_PROTECTED_PRIVATE_KEY)
+                .pem_password_cmd("echo password")
+                .cipher_list("AES256-SHA256")
+                .Build(&server));
+  EXPECT_OK(server->Start());
+
+  FLAGS_ssl_client_ca_certificate = SERVER_CERT;
+  auto s = ScopedFlagSetter<string>::Make(&FLAGS_ssl_cipher_list, "AES256-SHA256");
+  ThriftClient<StatestoreServiceClientWrapper> ssl_client(
+      "localhost", port, "", nullptr, true);
+
+  EXPECT_OK(ssl_client.Open());
+
+  bool send_done = false;
+  TRegisterSubscriberResponse resp;
+  EXPECT_NO_THROW({
+    ssl_client.iface()->RegisterSubscriber(
+        resp, TRegisterSubscriberRequest(), &send_done);
+  });
+}
+
+TEST(SslTest, OverlappingMatchedCiphers) {
+  int port = GetServerPort();
+  ThriftServer* server;
+  EXPECT_OK(ThriftServerBuilder("DummyStatestore", MakeProcessor(), port)
+      .ssl(SERVER_CERT, PASSWORD_PROTECTED_PRIVATE_KEY)
+      .pem_password_cmd("echo password")
+      .cipher_list("RC4-SHA,AES256-SHA256")
+      .Build(&server));
+  EXPECT_OK(server->Start());
+
+  FLAGS_ssl_client_ca_certificate = SERVER_CERT;
+  auto s = ScopedFlagSetter<string>::Make(&FLAGS_ssl_cipher_list,
+      "AES256-SHA256,not-a-cipher");
+  ThriftClient<StatestoreServiceClientWrapper> ssl_client(
+      "localhost", port, "", nullptr, true);
+
+  EXPECT_OK(ssl_client.Open());
+
+  bool send_done = false;
+  TRegisterSubscriberResponse resp;
+  EXPECT_NO_THROW({
+        ssl_client.iface()->RegisterSubscriber(
+            resp, TRegisterSubscriberRequest(), &send_done);
+      });
+}
+
 /// Test disabled because requires a high ulimit -n on build machines. Since the test does
 /// not always fail, we don't lose much coverage by disabling it until we fix the build
 /// infra issue.
@@ -189,13 +312,14 @@ TEST(ConcurrencyTest, DISABLED_ManyConcurrentConnections) {
   // Note that without the fix for IMPALA-4135, this test won't always fail, depending on
   // the hardware that it is run on.
   int port = GetServerPort();
-  ThriftServer* server = new ThriftServer("DummyServer", MakeProcessor(), port);
+  ThriftServer* server;
+  EXPECT_OK(ThriftServerBuilder("DummyServer", MakeProcessor(), port).Build(&server));
   ASSERT_OK(server->Start());
 
   ThreadPool<int64_t> pool(
       "group", "test", 256, 10000, [port](int tid, const int64_t& item) {
         using Client = ThriftClient<ImpalaInternalServiceClient>;
-        Client* client = new Client("127.0.0.1", port, "", NULL, false);
+        Client* client = new Client("127.0.0.1", port, "", nullptr, false);
         Status status = client->Open();
         ASSERT_OK(status);
       });
@@ -204,13 +328,16 @@ TEST(ConcurrencyTest, DISABLED_ManyConcurrentConnections) {
 }
 
 TEST(NoPasswordPemFile, BadServerCertificate) {
-  ThriftServer* server = new ThriftServer("DummyStatestore", MakeProcessor(),
-      FLAGS_state_store_port + 5, NULL, NULL, 5);
-  EXPECT_OK(server->EnableSsl(BAD_SERVER_CERT, BAD_PRIVATE_KEY));
-  EXPECT_OK(server->Start());
-  FLAGS_ssl_client_ca_certificate = SERVER_CERT;
+  int port = GetServerPort();
+  ThriftServer* server;
+  EXPECT_OK(ThriftServerBuilder("DummyStatestore", MakeProcessor(), port)
+                .ssl(BAD_SERVER_CERT, BAD_PRIVATE_KEY)
+                .Build(&server));
+  ASSERT_OK(server->Start());
+
+  auto s = ScopedFlagSetter<string>::Make(&FLAGS_ssl_client_ca_certificate, SERVER_CERT);
   ThriftClient<StatestoreServiceClientWrapper> ssl_client(
-      "localhost", FLAGS_state_store_port + 5, "", NULL, true);
+      "localhost", port, "", nullptr, true);
   EXPECT_OK(ssl_client.Open());
   TRegisterSubscriberResponse resp;
   bool send_done = false;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/68df21b4/be/src/rpc/thrift-server.cc
----------------------------------------------------------------------
diff --git a/be/src/rpc/thrift-server.cc b/be/src/rpc/thrift-server.cc
index a0a86a2..e1e9a7f 100644
--- a/be/src/rpc/thrift-server.cc
+++ b/be/src/rpc/thrift-server.cc
@@ -66,6 +66,7 @@ DECLARE_string(principal);
 DECLARE_string(keytab_file);
 DECLARE_string(ssl_client_ca_certificate);
 DECLARE_string(ssl_server_certificate);
+DECLARE_string(ssl_cipher_list);
 
 namespace impala {
 
@@ -352,6 +353,7 @@ Status ThriftServer::CreateSocket(boost::shared_ptr<TServerTransport>* socket) {
         new ImpalaSslSocketFactory(key_password_));
     socket_factory->overrideDefaultPasswordCallback();
     try {
+      if (!cipher_list_.empty()) socket_factory->ciphers(cipher_list_);
       socket_factory->loadCertificate(certificate_path_.c_str());
       socket_factory->loadPrivateKey(private_key_path_.c_str());
       socket->reset(new TSSLServerSocket(port_, socket_factory));
@@ -366,7 +368,7 @@ Status ThriftServer::CreateSocket(boost::shared_ptr<TServerTransport>* socket) {
 }
 
 Status ThriftServer::EnableSsl(const string& certificate, const string& private_key,
-    const string& pem_password_cmd) {
+    const string& pem_password_cmd, const std::string& ciphers) {
   DCHECK(!started_);
   if (certificate.empty()) return Status(TErrorCode::SSL_CERTIFICATE_PATH_BLANK);
   if (private_key.empty()) return Status(TErrorCode::SSL_PRIVATE_KEY_PATH_BLANK);
@@ -383,6 +385,7 @@ Status ThriftServer::EnableSsl(const string& certificate, const string& private_
   ssl_enabled_ = true;
   certificate_path_ = certificate;
   private_key_path_ = private_key;
+  cipher_list_ = ciphers;
 
   if (!pem_password_cmd.empty()) {
     if (!RunShellProcess(pem_password_cmd, &key_password_, true)) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/68df21b4/be/src/rpc/thrift-server.h
----------------------------------------------------------------------
diff --git a/be/src/rpc/thrift-server.h b/be/src/rpc/thrift-server.h
index 5b85134..e52edea 100644
--- a/be/src/rpc/thrift-server.h
+++ b/be/src/rpc/thrift-server.h
@@ -15,7 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
 
-
 #ifndef IMPALA_RPC_THRIFT_SERVER_H
 #define IMPALA_RPC_THRIFT_SERVER_H
 
@@ -37,8 +36,11 @@ namespace impala {
 /// Utility class for all Thrift servers. Runs a threaded server by default, or a
 /// TThreadPoolServer with, by default, 2 worker threads, that exposes the interface
 /// described by a user-supplied TProcessor object.
+///
+/// Use a ThriftServerBuilder to construct a ThriftServer. ThriftServer's c'tors are
+/// private.
+///
 /// If TThreadPoolServer is used, client must use TSocket as transport.
-/// TODO: Need a builder to help with the unwieldy constructor
 /// TODO: shutdown is buggy (which only harms tests)
 class ThriftServer {
  public:
@@ -95,30 +97,6 @@ class ThriftServer {
   /// Threaded    -- Allocates 1 thread per connection, as needed.
   enum ServerType { ThreadPool = 0, Threaded };
 
-  /// Creates, but does not start, a new server on the specified port
-  /// that exports the supplied interface.
-  ///  - name: human-readable name of this server. Should not contain spaces
-  ///  - processor: Thrift processor to handle RPCs
-  ///  - port: The port the server will listen for connections on
-  ///  - auth_provider: Authentication scheme to use. If NULL, use the global default
-  ///    demon<->demon provider.
-  ///  - metrics: if not NULL, the server will register metrics on this object
-  ///  - num_worker_threads: the number of worker threads to use in any thread pool
-  ///  - server_type: the type of IO strategy this server should employ
-  ThriftServer(const std::string& name,
-      const boost::shared_ptr<apache::thrift::TProcessor>& processor, int port,
-      AuthProvider* auth_provider = NULL, MetricGroup* metrics = NULL,
-      int num_worker_threads = DEFAULT_WORKER_THREADS, ServerType server_type = Threaded);
-
-  /// Enables secure access over SSL. Must be called before Start(). The first two
-  /// arguments are paths to certificate and private key files in .PEM format,
-  /// respectively. If either file does not exist, an error is returned. The final
-  /// optional argument provides the command to run if a password is required to decrypt
-  /// the private key. It is invoked once, and the resulting password is used only for
-  /// password-protected .PEM files.
-  Status EnableSsl(const std::string& certificate, const std::string& private_key,
-      const std::string& pem_password_cmd = "");
-
   int port() const { return port_; }
 
   bool ssl_enabled() const { return ssl_enabled_; }
@@ -161,6 +139,32 @@ class ThriftServer {
   static const ConnectionContext* GetThreadConnectionContext();
 
  private:
+  friend class ThriftServerBuilder;
+
+  /// Creates, but does not start, a new server on the specified port
+  /// that exports the supplied interface.
+  ///  - name: human-readable name of this server. Should not contain spaces
+  ///  - processor: Thrift processor to handle RPCs
+  ///  - port: The port the server will listen for connections on
+  ///  - auth_provider: Authentication scheme to use. If nullptr, use the global default
+  ///    daemon<->daemon provider.
+  ///  - metrics: if not nullptr, the server will register metrics on this object
+  ///  - num_worker_threads: the number of worker threads to use in any thread pool
+  ///  - server_type: the type of IO strategy this server should employ
+  ThriftServer(const std::string& name,
+      const boost::shared_ptr<apache::thrift::TProcessor>& processor, int port,
+      AuthProvider* auth_provider = nullptr, MetricGroup* metrics = nullptr,
+      int num_worker_threads = DEFAULT_WORKER_THREADS, ServerType server_type = Threaded);
+
+  /// Enables secure access over SSL. Must be called before Start(). The first two
+  /// arguments are paths to certificate and private key files in .PEM format,
+  /// respectively. If either file does not exist, an error is returned. If non-empty,
+  /// 'pem_password_cmd' is invoked once and the resulting password is used only for
+  /// password-protected .PEM files. If non-empty, 'ciphers' is the list of cipher
+  /// suites (in OpenSSL cipher list format) that clients may use when connecting.
+  Status EnableSsl(const std::string& certificate, const std::string& private_key,
+      const std::string& pem_password_cmd = "", const std::string& ciphers = "");
+
   /// Creates the server socket on which this server listens. May be SSL enabled. Returns
   /// OK unless there was a Thrift error.
   Status CreateSocket(
@@ -184,6 +188,9 @@ class ThriftServer {
   /// Password string retrieved by running command in EnableSsl().
   std::string key_password_;
 
+  /// List of ciphers that are ok for clients to use when connecting.
+  std::string cipher_list_;
+
   /// How many worker threads to use to serve incoming requests
   /// (requests are queued if no thread is immediately available)
   int num_worker_threads_;
@@ -201,7 +208,7 @@ class ThriftServer {
   boost::scoped_ptr<apache::thrift::server::TServer> server_;
   boost::shared_ptr<apache::thrift::TProcessor> processor_;
 
-  /// If not NULL, called when connection events happen. Not owned by us.
+  /// If not nullptr, called when connection events happen. Not owned by us.
   ConnectionHandlerIf* connection_handler_;
 
   /// Protects connection_contexts_
@@ -237,6 +244,94 @@ class ThriftServer {
   friend class ThriftServerEventProcessor;
 };
 
+/// Helper class to build new ThriftServer instances.
+class ThriftServerBuilder {
+ public:
+  ThriftServerBuilder(const std::string& name,
+      const boost::shared_ptr<apache::thrift::TProcessor>& processor, int port)
+    : name_(name), processor_(processor), port_(port) {}
+
+  /// Sets the auth provider for this server. Default is the system global auth provider.
+  ThriftServerBuilder& auth_provider(AuthProvider* provider) {
+    auth_provider_ = provider;
+    return *this;
+  }
+
+  /// Sets the metrics instance that this server should register metrics with. Default is
+  /// nullptr.
+  ThriftServerBuilder& metrics(MetricGroup* metrics) {
+    metrics_ = metrics;
+    return *this;
+  }
+
+  /// Make this server a thread-pool server with 'num_worker_threads' threads.
+  ThriftServerBuilder& thread_pool(int num_worker_threads) {
+    server_type_ = ThriftServer::ServerType::ThreadPool;
+    num_worker_threads_ = num_worker_threads;
+    return *this;
+  }
+
+  /// Make this server a threaded server (i.e. one thread per connection).
+  ThriftServerBuilder& threaded() {
+    server_type_ = ThriftServer::ServerType::Threaded;
+    return *this;
+  }
+
+  /// Enables SSL for this server.
+  ThriftServerBuilder& ssl(
+      const std::string& certificate, const std::string& private_key) {
+    enable_ssl_ = true;
+    certificate_ = certificate;
+    private_key_ = private_key;
+    return *this;
+  }
+
+  /// Sets the command used to compute the password for the SSL private key. Default is
+  /// empty, i.e. no password needed.
+  ThriftServerBuilder& pem_password_cmd(const std::string& pem_password_cmd) {
+    pem_password_cmd_ = pem_password_cmd;
+    return *this;
+  }
+
+  /// Sets the list of acceptable cipher suites for this server. Default is to use all
+  /// available system cipher suites.
+  ThriftServerBuilder& cipher_list(const std::string& ciphers) {
+    ciphers_ = ciphers;
+    return *this;
+  }
+
+  /// Constructs a new ThriftServer and, if construction was successful, stores it in
+  /// '*server' and returns OK. Returns an error otherwise. In the error case, '*server'
+  /// will not have been set and will not need to be freed; otherwise the caller assumes
+  /// ownership of '*server'.
+  Status Build(ThriftServer** server) {
+    std::unique_ptr<ThriftServer> ptr(new ThriftServer(name_, processor_, port_,
+        auth_provider_, metrics_, num_worker_threads_, server_type_));
+    if (enable_ssl_) {
+      RETURN_IF_ERROR(
+          ptr->EnableSsl(certificate_, private_key_, pem_password_cmd_, ciphers_));
+    }
+    (*server) = ptr.release();
+    return Status::OK();
+  }
+
+ private:
+  ThriftServer::ServerType server_type_ = ThriftServer::ServerType::Threaded;
+  int num_worker_threads_ = ThriftServer::DEFAULT_WORKER_THREADS;
+  std::string name_;
+  boost::shared_ptr<apache::thrift::TProcessor> processor_;
+  int port_ = 0;
+
+  AuthProvider* auth_provider_ = nullptr;
+  MetricGroup* metrics_ = nullptr;
+
+  bool enable_ssl_ = false;
+  std::string certificate_;
+  std::string private_key_;
+  std::string pem_password_cmd_;
+  std::string ciphers_;
+};
+
 // Returns true if, per the process configuration flags, server<->server communications
 // should use SSL.
 bool EnableInternalSslConnections();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/68df21b4/be/src/runtime/data-stream-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/data-stream-test.cc b/be/src/runtime/data-stream-test.cc
index a358286..990d8bb 100644
--- a/be/src/runtime/data-stream-test.cc
+++ b/be/src/runtime/data-stream-test.cc
@@ -454,7 +454,7 @@ class DataStreamTest : public testing::Test {
   void StartBackend() {
     boost::shared_ptr<ImpalaTestBackend> handler(new ImpalaTestBackend(stream_mgr_));
     boost::shared_ptr<TProcessor> processor(new ImpalaInternalServiceProcessor(handler));
-    server_ = new ThriftServer("DataStreamTest backend", processor, FLAGS_port, nullptr);
+    ThriftServerBuilder("DataStreamTest backend", processor, FLAGS_port).Build(&server_);
     server_->Start();
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/68df21b4/be/src/service/impala-server.cc
----------------------------------------------------------------------
diff --git a/be/src/service/impala-server.cc b/be/src/service/impala-server.cc
index 4870091..92549cf 100644
--- a/be/src/service/impala-server.cc
+++ b/be/src/service/impala-server.cc
@@ -170,6 +170,13 @@ DEFINE_string(ssl_private_key_password_cmd, "", "A Unix command whose output ret
     "then all trailing whitespace will be trimmed before it is used to decrypt the "
     "private key");
 
+// TODO: For 3.0 (compatibility-breaking release), set this to a whitelist of ciphers,
+// e.g.  https://wiki.mozilla.org/Security/Server_Side_TLS
+DEFINE_string(ssl_cipher_list, "",
+    "The cipher suite preferences to use for TLS-secured "
+    "Thrift RPC connections. Uses the OpenSSL cipher preference list format. See man (1) "
+    "ciphers for more information. If empty, the default cipher list for your platform "
+    "is used");
 
 DEFINE_int32(idle_session_timeout, 0, "The time, in seconds, that a session may be idle"
     " for before it is closed (and all running queries cancelled) by Impala. If 0, idle"
@@ -1931,19 +1938,19 @@ Status CreateImpalaServer(ExecEnv* exec_env, int beeswax_port, int hs2_port, int
         new RpcEventHandler("backend", exec_env->metrics()));
     be_processor->setEventHandler(event_handler);
 
-    *be_server = new ThriftServer("backend", be_processor, be_port, nullptr,
-        exec_env->metrics());
+    ThriftServerBuilder be_builder("backend", be_processor, be_port);
+
     if (EnableInternalSslConnections()) {
       LOG(INFO) << "Enabling SSL for backend";
-      RETURN_IF_ERROR((*be_server)->EnableSsl(FLAGS_ssl_server_certificate,
-          FLAGS_ssl_private_key, FLAGS_ssl_private_key_password_cmd));
+      be_builder.ssl(FLAGS_ssl_server_certificate, FLAGS_ssl_private_key)
+          .pem_password_cmd(FLAGS_ssl_private_key_password_cmd)
+          .cipher_list(FLAGS_ssl_cipher_list);
     }
-
+    RETURN_IF_ERROR(be_builder.metrics(exec_env->metrics()).Build(be_server));
     LOG(INFO) << "ImpalaInternalService listening on " << be_port;
   }
 
   if (!FLAGS_is_coordinator) {
-
     LOG(INFO) << "Started executor Impala server on "
               << ExecEnv::GetInstance()->backend_address();
     return Status::OK();
@@ -1958,16 +1965,20 @@ Status CreateImpalaServer(ExecEnv* exec_env, int beeswax_port, int hs2_port, int
     boost::shared_ptr<TProcessorEventHandler> event_handler(
         new RpcEventHandler("beeswax", exec_env->metrics()));
     beeswax_processor->setEventHandler(event_handler);
-    *beeswax_server = new ThriftServer(BEESWAX_SERVER_NAME, beeswax_processor,
-        beeswax_port, AuthManager::GetInstance()->GetExternalAuthProvider(),
-        exec_env->metrics(), FLAGS_fe_service_threads, ThriftServer::ThreadPool);
+    ThriftServerBuilder builder(BEESWAX_SERVER_NAME, beeswax_processor, beeswax_port);
 
-    (*beeswax_server)->SetConnectionHandler(impala_server->get());
     if (!FLAGS_ssl_server_certificate.empty()) {
       LOG(INFO) << "Enabling SSL for Beeswax";
-      RETURN_IF_ERROR((*beeswax_server)->EnableSsl(FLAGS_ssl_server_certificate,
-          FLAGS_ssl_private_key, FLAGS_ssl_private_key_password_cmd));
+      builder.ssl(FLAGS_ssl_server_certificate, FLAGS_ssl_private_key)
+          .pem_password_cmd(FLAGS_ssl_private_key_password_cmd)
+          .cipher_list(FLAGS_ssl_cipher_list);
     }
+    RETURN_IF_ERROR(
+        builder.auth_provider(AuthManager::GetInstance()->GetExternalAuthProvider())
+            .metrics(exec_env->metrics())
+            .thread_pool(FLAGS_fe_service_threads)
+            .Build(beeswax_server));
+    (*beeswax_server)->SetConnectionHandler(impala_server->get());
 
     LOG(INFO) << "Impala Beeswax Service listening on " << beeswax_port;
   }
@@ -1980,17 +1991,22 @@ Status CreateImpalaServer(ExecEnv* exec_env, int beeswax_port, int hs2_port, int
         new RpcEventHandler("hs2", exec_env->metrics()));
     hs2_fe_processor->setEventHandler(event_handler);
 
-    *hs2_server = new ThriftServer(HS2_SERVER_NAME, hs2_fe_processor, hs2_port,
-        AuthManager::GetInstance()->GetExternalAuthProvider(), exec_env->metrics(),
-        FLAGS_fe_service_threads, ThriftServer::ThreadPool);
+    ThriftServerBuilder builder(HS2_SERVER_NAME, hs2_fe_processor, hs2_port);
 
-    (*hs2_server)->SetConnectionHandler(impala_server->get());
     if (!FLAGS_ssl_server_certificate.empty()) {
       LOG(INFO) << "Enabling SSL for HiveServer2";
-      RETURN_IF_ERROR((*hs2_server)->EnableSsl(FLAGS_ssl_server_certificate,
-          FLAGS_ssl_private_key, FLAGS_ssl_private_key_password_cmd));
+      builder.ssl(FLAGS_ssl_server_certificate, FLAGS_ssl_private_key)
+          .pem_password_cmd(FLAGS_ssl_private_key_password_cmd)
+          .cipher_list(FLAGS_ssl_cipher_list);
     }
 
+    RETURN_IF_ERROR(
+        builder.auth_provider(AuthManager::GetInstance()->GetExternalAuthProvider())
+            .metrics(exec_env->metrics())
+            .thread_pool(FLAGS_fe_service_threads)
+            .Build(hs2_server));
+    (*hs2_server)->SetConnectionHandler(impala_server->get());
+
     LOG(INFO) << "Impala HiveServer2 Service listening on " << hs2_port;
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/68df21b4/be/src/statestore/statestore-subscriber.cc
----------------------------------------------------------------------
diff --git a/be/src/statestore/statestore-subscriber.cc b/be/src/statestore/statestore-subscriber.cc
index 12efdcd..69ddfdc 100644
--- a/be/src/statestore/statestore-subscriber.cc
+++ b/be/src/statestore/statestore-subscriber.cc
@@ -47,11 +47,12 @@ DEFINE_int32(statestore_subscriber_cnxn_attempts, 10, "The number of times to re
     "RPC connection to the statestore. A setting of 0 means retry indefinitely");
 DEFINE_int32(statestore_subscriber_cnxn_retry_interval_ms, 3000, "The interval, in ms, "
     "to wait between attempts to make an RPC connection to the statestore.");
-DECLARE_string(ssl_client_ca_certificate);
 
+DECLARE_string(ssl_client_ca_certificate);
 DECLARE_string(ssl_server_certificate);
 DECLARE_string(ssl_private_key);
 DECLARE_string(ssl_private_key_password_cmd);
+DECLARE_string(ssl_cipher_list);
 
 namespace impala {
 
@@ -192,13 +193,18 @@ Status StatestoreSubscriber::Start() {
         new RpcEventHandler("statestore-subscriber", metrics_));
     processor->setEventHandler(event_handler);
 
-    heartbeat_server_.reset(new ThriftServer("StatestoreSubscriber", processor,
-        heartbeat_address_.port, NULL, NULL, 5));
+    ThriftServerBuilder builder(
+        "StatestoreSubscriber", processor, heartbeat_address_.port);
     if (EnableInternalSslConnections()) {
       LOG(INFO) << "Enabling SSL for Statestore subscriber";
-      RETURN_IF_ERROR(heartbeat_server_->EnableSsl(FLAGS_ssl_server_certificate,
-          FLAGS_ssl_private_key, FLAGS_ssl_private_key_password_cmd));
+      builder.ssl(FLAGS_ssl_server_certificate, FLAGS_ssl_private_key)
+          .pem_password_cmd(FLAGS_ssl_private_key_password_cmd)
+          .cipher_list(FLAGS_ssl_cipher_list);
     }
+
+    ThriftServer* server;
+    RETURN_IF_ERROR(builder.Build(&server));
+    heartbeat_server_.reset(server);
     RETURN_IF_ERROR(heartbeat_server_->Start());
 
     LOG(INFO) << "Registering with statestore";

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/68df21b4/be/src/statestore/statestored-main.cc
----------------------------------------------------------------------
diff --git a/be/src/statestore/statestored-main.cc b/be/src/statestore/statestored-main.cc
index 40e2a73..f4c4672 100644
--- a/be/src/statestore/statestored-main.cc
+++ b/be/src/statestore/statestored-main.cc
@@ -42,6 +42,7 @@ DECLARE_string(principal);
 DECLARE_string(ssl_server_certificate);
 DECLARE_string(ssl_private_key);
 DECLARE_string(ssl_private_key_password_cmd);
+DECLARE_string(ssl_cipher_list);
 
 #include "common/names.h"
 
@@ -83,13 +84,15 @@ int StatestoredMain(int argc, char** argv) {
       new RpcEventHandler("statestore", metrics.get()));
   processor->setEventHandler(event_handler);
 
-  ThriftServer* server = new ThriftServer("StatestoreService", processor,
-      FLAGS_state_store_port, NULL, metrics.get(), 5);
+  ThriftServer* server;
+  ThriftServerBuilder builder("StatestoreService", processor, FLAGS_state_store_port);
   if (EnableInternalSslConnections()) {
     LOG(INFO) << "Enabling SSL for Statestore";
-    ABORT_IF_ERROR(server->EnableSsl(FLAGS_ssl_server_certificate, FLAGS_ssl_private_key,
-        FLAGS_ssl_private_key_password_cmd));
+    builder.ssl(FLAGS_ssl_server_certificate, FLAGS_ssl_private_key)
+        .pem_password_cmd(FLAGS_ssl_private_key_password_cmd)
+        .cipher_list(FLAGS_ssl_cipher_list);
   }
+  ABORT_IF_ERROR(builder.metrics(metrics.get()).Build(&server));
   ABORT_IF_ERROR(server->Start());
 
   statestore.MainLoop();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/68df21b4/be/src/testutil/in-process-servers.cc
----------------------------------------------------------------------
diff --git a/be/src/testutil/in-process-servers.cc b/be/src/testutil/in-process-servers.cc
index 015e7e2..7a81915 100644
--- a/be/src/testutil/in-process-servers.cc
+++ b/be/src/testutil/in-process-servers.cc
@@ -162,13 +162,14 @@ Status InProcessStatestore::Start() {
   boost::shared_ptr<TProcessor> processor(
       new StatestoreServiceProcessor(statestore_->thrift_iface()));
 
-  statestore_server_.reset(new ThriftServer("StatestoreService", processor,
-      statestore_port_, NULL, metrics_.get(), 5));
+  ThriftServerBuilder builder("StatestoreService", processor, statestore_port_);
   if (EnableInternalSslConnections()) {
     LOG(INFO) << "Enabling SSL for Statestore";
-    ABORT_IF_ERROR(statestore_server_->EnableSsl(
-        FLAGS_ssl_server_certificate, FLAGS_ssl_private_key));
+    builder.ssl(FLAGS_ssl_server_certificate, FLAGS_ssl_private_key);
   }
+  ThriftServer* server;
+  ABORT_IF_ERROR(builder.metrics(metrics_.get()).Build(&server));
+  statestore_server_.reset(server);
   statestore_main_loop_.reset(
       new Thread("statestore", "main-loop", &Statestore::MainLoop, statestore_.get()));
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/68df21b4/be/src/testutil/scoped-flag-setter.h
----------------------------------------------------------------------
diff --git a/be/src/testutil/scoped-flag-setter.h b/be/src/testutil/scoped-flag-setter.h
new file mode 100644
index 0000000..49fa449
--- /dev/null
+++ b/be/src/testutil/scoped-flag-setter.h
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_TESTUTIL_SCOPED_FLAG_SETTER_H
+#define IMPALA_TESTUTIL_SCOPED_FLAG_SETTER_H
+
+namespace impala {
+
+/// Temporarily sets a flag for the duration of its scope, then resets the flag to its
+/// original value upon destruction.
+///
+/// Example (pre-condition: FLAGS_my_string_flag == "world"):
+/// {
+///   auto s = ScopedFlagSetter<string>::Make(&FLAGS_my_string_flag, "hello");
+///   // ... FLAGS_my_string_flag == "hello" for entire scope
+/// }
+/// // After destruction of 's', FLAGS_my_string_flag == "world" again.
+template <typename T>
+struct ScopedFlagSetter {
+  static ScopedFlagSetter<T> Make(T* f, const T& new_val) {
+    return ScopedFlagSetter(f, new_val);
+  }
+
+  ~ScopedFlagSetter() { *flag = old_val; }
+
+ private:
+  ScopedFlagSetter(T* f, T new_val) {
+    flag = f;
+    old_val = *f;
+    *f = new_val;
+  }
+
+  T* flag;
+  T old_val;
+};
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/68df21b4/be/src/util/webserver-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/webserver-test.cc b/be/src/util/webserver-test.cc
index 7298ee8..1abbe42 100644
--- a/be/src/util/webserver-test.cc
+++ b/be/src/util/webserver-test.cc
@@ -21,10 +21,11 @@
 #include <boost/lexical_cast.hpp>
 #include <gutil/strings/substitute.h>
 
+#include "common/init.h"
 #include "testutil/gtest-util.h"
+#include "testutil/scoped-flag-setter.h"
 #include "util/webserver.h"
 #include "util/default-path-handlers.h"
-#include "common/init.h"
 
 DECLARE_int32(webserver_port);
 DECLARE_string(webserver_password_file);
@@ -201,25 +202,10 @@ TEST(Webserver, EscapeErrorUriTest) {
       string::npos);
 }
 
-template<typename T>
-struct ScopedFlagSetter {
-  T* flag;
-  T old_val;
-  ScopedFlagSetter(T* f, T new_val) {
-    flag = f;
-    old_val = *f;
-    *f = new_val;
-  }
-
-  ~ScopedFlagSetter() {
-    *flag = old_val;
-  }
-};
-
 TEST(Webserver, SslTest) {
-  ScopedFlagSetter<string> certificate(&FLAGS_webserver_certificate_file,
+  auto cert = ScopedFlagSetter<string>::Make(&FLAGS_webserver_certificate_file,
       Substitute("$0/be/src/testutil/server-cert.pem", getenv("IMPALA_HOME")));
-  ScopedFlagSetter<string> private_key(&FLAGS_webserver_private_key_file,
+  auto key = ScopedFlagSetter<string>::Make(&FLAGS_webserver_private_key_file,
       Substitute("$0/be/src/testutil/server-key.pem", getenv("IMPALA_HOME")));
 
   Webserver webserver(FLAGS_webserver_port);
@@ -227,9 +213,9 @@ TEST(Webserver, SslTest) {
 }
 
 TEST(Webserver, SslBadCertTest) {
-  ScopedFlagSetter<string> certificate(&FLAGS_webserver_certificate_file,
+  auto cert = ScopedFlagSetter<string>::Make(&FLAGS_webserver_certificate_file,
       Substitute("$0/be/src/testutil/invalid-server-cert.pem", getenv("IMPALA_HOME")));
-  ScopedFlagSetter<string> private_key(&FLAGS_webserver_private_key_file,
+  auto key = ScopedFlagSetter<string>::Make(&FLAGS_webserver_private_key_file,
       Substitute("$0/be/src/testutil/server-key.pem", getenv("IMPALA_HOME")));
 
   Webserver webserver(FLAGS_webserver_port);
@@ -237,11 +223,11 @@ TEST(Webserver, SslBadCertTest) {
 }
 
 TEST(Webserver, SslWithPrivateKeyPasswordTest) {
-  ScopedFlagSetter<string> certificate(&FLAGS_webserver_certificate_file,
+  auto cert = ScopedFlagSetter<string>::Make(&FLAGS_webserver_certificate_file,
       Substitute("$0/be/src/testutil/server-cert.pem", getenv("IMPALA_HOME")));
-  ScopedFlagSetter<string> private_key(&FLAGS_webserver_private_key_file,
+  auto key = ScopedFlagSetter<string>::Make(&FLAGS_webserver_private_key_file,
       Substitute("$0/be/src/testutil/server-key-password.pem", getenv("IMPALA_HOME")));
-  ScopedFlagSetter<string> password_cmd(
+  auto cmd = ScopedFlagSetter<string>::Make(
       &FLAGS_webserver_private_key_password_cmd, "echo password");
 
   Webserver webserver(FLAGS_webserver_port);
@@ -249,11 +235,11 @@ TEST(Webserver, SslWithPrivateKeyPasswordTest) {
 }
 
 TEST(Webserver, SslBadPrivateKeyPasswordTest) {
-  ScopedFlagSetter<string> certificate(&FLAGS_webserver_certificate_file,
+  auto cert = ScopedFlagSetter<string>::Make(&FLAGS_webserver_certificate_file,
       Substitute("$0/be/src/testutil/server-cert.pem", getenv("IMPALA_HOME")));
-  ScopedFlagSetter<string> private_key(&FLAGS_webserver_private_key_file,
+  auto key = ScopedFlagSetter<string>::Make(&FLAGS_webserver_private_key_file,
       Substitute("$0/be/src/testutil/server-key-password.pem", getenv("IMPALA_HOME")));
-  ScopedFlagSetter<string> password_cmd(
+  auto cmd = ScopedFlagSetter<string>::Make(
       &FLAGS_webserver_private_key_password_cmd, "echo wrongpassword");
 
   Webserver webserver(FLAGS_webserver_port);
@@ -263,8 +249,8 @@ TEST(Webserver, SslBadPrivateKeyPasswordTest) {
 TEST(Webserver, StartWithPasswordFileTest) {
   stringstream password_file;
   password_file << getenv("IMPALA_HOME") << "/be/src/testutil/htpasswd";
-  ScopedFlagSetter<string> password_flag(&FLAGS_webserver_password_file,
-      password_file.str());
+  auto password =
+      ScopedFlagSetter<string>::Make(&FLAGS_webserver_password_file, password_file.str());
 
   Webserver webserver(FLAGS_webserver_port);
   ASSERT_OK(webserver.Start());
@@ -277,8 +263,8 @@ TEST(Webserver, StartWithPasswordFileTest) {
 TEST(Webserver, StartWithMissingPasswordFileTest) {
   stringstream password_file;
   password_file << getenv("IMPALA_HOME") << "/be/src/testutil/doesntexist";
-  ScopedFlagSetter<string> password_flag(&FLAGS_webserver_password_file,
-      password_file.str());
+  auto password =
+      ScopedFlagSetter<string>::Make(&FLAGS_webserver_password_file, password_file.str());
 
   Webserver webserver(FLAGS_webserver_port);
   ASSERT_FALSE(webserver.Start().ok());
@@ -314,8 +300,8 @@ TEST(Webserver, NoFrameEmbeddingTest) {
 }
 TEST(Webserver, FrameAllowEmbeddingTest) {
   const string FRAME_TEST_PATH = "/frames_test";
-  ScopedFlagSetter<string> webserver_x_frame_options(&FLAGS_webserver_x_frame_options,
-      "ALLOWALL");
+  auto x_frame_opt =
+      ScopedFlagSetter<string>::Make(&FLAGS_webserver_x_frame_options, "ALLOWALL");
   Webserver webserver(FLAGS_webserver_port);
   Webserver::UrlCallback callback = bind<void>(FrameCallback, _1, _2);
   webserver.RegisterUrlCallback(FRAME_TEST_PATH, "raw_text.tmpl", callback);


[11/11] incubator-impala git commit: IMPALA-5661: buffer pool limit

Posted by ta...@apache.org.
IMPALA-5661: buffer pool limit

Adds the --buffer_pool_limit flag to control the buffer pool size.
It can be specified as either an absolute memory value or a percentage
of the process memory limit.

Testing:
Started up a cluster with --buffer_pool_limit=10%, confirmed via
/metrics page that the buffer pool limit was reduced to ~800MB on
my system.

Change-Id: Ia64e21e0d5a7cf35a9064f365c6c86db13fbd73d
Reviewed-on: http://gerrit.cloudera.org:8080/7462
Reviewed-by: Matthew Jacobs <mj...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/f14e68c7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/f14e68c7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/f14e68c7

Branch: refs/heads/master
Commit: f14e68c7255927a595b5fc017cc86960f59bc8df
Parents: 68df21b
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Tue Jul 18 16:57:30 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Aug 8 10:45:33 2017 +0000

----------------------------------------------------------------------
 be/src/common/constant-strings.h          | 31 ++++++++++++++++++++++++++
 be/src/common/global-flags.cc             | 22 ++++++++++++++----
 be/src/runtime/exec-env.cc                | 19 ++++++++++------
 be/src/scheduling/request-pool-service.cc | 13 ++++++-----
 4 files changed, 68 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f14e68c7/be/src/common/constant-strings.h
----------------------------------------------------------------------
diff --git a/be/src/common/constant-strings.h b/be/src/common/constant-strings.h
new file mode 100644
index 0000000..66f0b0b
--- /dev/null
+++ b/be/src/common/constant-strings.h
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This file includes constant strings that are used in multiple places in the codebase.
+
+#ifndef IMPALA_COMMON_CONSTANT_STRINGS_H_
+#define IMPALA_COMMON_CONSTANT_STRINGS_H_
+
+// Template for a description of the ways to specify bytes. strings::Substitute() must
+// be used to fill in the blanks.
+#define MEM_UNITS_HELP_MSG "Specified as number of bytes ('<int>[bB]?'), " \
+                          "megabytes ('<float>[mM]'), " \
+                          "gigabytes ('<float>[gG]'), " \
+                          "or percentage of $0 ('<int>%'). " \
+                          "Defaults to bytes if no unit is given."
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f14e68c7/be/src/common/global-flags.cc
----------------------------------------------------------------------
diff --git a/be/src/common/global-flags.cc b/be/src/common/global-flags.cc
index 0a6ca28..98417f2 100644
--- a/be/src/common/global-flags.cc
+++ b/be/src/common/global-flags.cc
@@ -21,7 +21,13 @@
 // a main()), or flags that are referenced from multiple places and having them here
 // calms the linker errors that would otherwise ensue.
 
+#include <string>
+
+#include "common/constant-strings.h"
 #include "common/logging.h"
+#include "gutil/strings/substitute.h"
+
+#include "common/names.h"
 
 // This will be defaulted to the host name returned by the OS.
 // This name is used in the principal generated for Kerberos authorization.
@@ -41,10 +47,18 @@ DEFINE_string(krb5_conf, "", "Absolute path to Kerberos krb5.conf if in a non-st
     "location. Does not normally need to be set.");
 DEFINE_string(krb5_debug_file, "", "Turn on Kerberos debugging and output to this file");
 
-DEFINE_string(mem_limit, "80%", "Process memory limit specified as number of bytes "
-              "('<int>[bB]?'), megabytes ('<float>[mM]'), gigabytes ('<float>[gG]'), "
-              "or percentage of the physical memory ('<int>%'). "
-              "Defaults to bytes if no unit is given");
+static const string mem_limit_help_msg = "Limit on process memory consumption, "
+    "excluding the JVM's memory consumption. "
+    + Substitute(MEM_UNITS_HELP_MSG, "the physical memory");
+DEFINE_string(mem_limit, "80%",  mem_limit_help_msg.c_str());
+
+static const string buffer_pool_limit_help_msg = "(Advanced) Limit on buffer pool size. "
+     + Substitute(MEM_UNITS_HELP_MSG, "the process memory limit") + " "
+    "The default value and behaviour of this flag may change between releases.";
+DEFINE_string(buffer_pool_limit, "80%", buffer_pool_limit_help_msg.c_str());
+
+DEFINE_int64(min_buffer_size, 64 * 1024,
+    "(Advanced) The minimum buffer size to use in the buffer pool");
 
 DEFINE_bool(enable_process_lifetime_heap_profiling, false, "(Advanced) Enables heap "
     "profiling for the lifetime of the process. Profile output will be stored in the "

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f14e68c7/be/src/runtime/exec-env.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/exec-env.cc b/be/src/runtime/exec-env.cc
index f2ee6f0..8239a68 100644
--- a/be/src/runtime/exec-env.cc
+++ b/be/src/runtime/exec-env.cc
@@ -75,14 +75,14 @@ DEFINE_int32(state_store_subscriber_port, 23000,
 DEFINE_int32(num_hdfs_worker_threads, 16,
     "(Advanced) The number of threads in the global HDFS operation pool");
 DEFINE_bool(disable_admission_control, false, "Disables admission control.");
-DEFINE_int64(min_buffer_size, 64 * 1024,
-    "(Advanced) The minimum buffer size to use in the buffer pool");
 
 DECLARE_int32(state_store_port);
 DECLARE_int32(num_threads_per_core);
 DECLARE_int32(num_cores);
 DECLARE_int32(be_port);
 DECLARE_string(mem_limit);
+DECLARE_string(buffer_pool_limit);
+DECLARE_int64(min_buffer_size);
 DECLARE_bool(is_coordinator);
 DECLARE_int32(webserver_port);
 
@@ -241,9 +241,14 @@ Status ExecEnv::StartServices() {
     return Status(Substitute(
         "--min_buffer_size must be a power-of-two: $0", FLAGS_min_buffer_size));
   }
-  int64_t buffer_pool_capacity = BitUtil::RoundDown(
-      no_process_mem_limit ? system_mem : bytes_limit * 4 / 5, FLAGS_min_buffer_size);
-  InitBufferPool(FLAGS_min_buffer_size, buffer_pool_capacity);
+  int64_t buffer_pool_limit = ParseUtil::ParseMemSpec(FLAGS_buffer_pool_limit,
+      &is_percent, no_process_mem_limit ? system_mem : bytes_limit);
+  if (buffer_pool_limit <= 0) {
+    return Status(Substitute("Invalid --buffer_pool_limit value, must be a percentage or "
+          "positive bytes value: $0", FLAGS_buffer_pool_limit));
+  }
+  buffer_pool_limit = BitUtil::RoundDown(buffer_pool_limit, FLAGS_min_buffer_size);
+  InitBufferPool(FLAGS_min_buffer_size, buffer_pool_limit);
 
   metrics_->Init(enable_webserver_ ? webserver_.get() : nullptr);
   impalad_client_cache_->InitMetrics(metrics_.get(), "impala-server.backends");
@@ -282,8 +287,8 @@ Status ExecEnv::StartServices() {
   }
   LOG(INFO) << "Using global memory limit: "
             << PrettyPrinter::Print(bytes_limit, TUnit::BYTES);
-  LOG(INFO) << "Buffer pool capacity: "
-            << PrettyPrinter::Print(buffer_pool_capacity, TUnit::BYTES);
+  LOG(INFO) << "Buffer pool limit: "
+            << PrettyPrinter::Print(buffer_pool_limit, TUnit::BYTES);
 
   RETURN_IF_ERROR(disk_io_mgr_->Init(mem_tracker_.get()));
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f14e68c7/be/src/scheduling/request-pool-service.cc
----------------------------------------------------------------------
diff --git a/be/src/scheduling/request-pool-service.cc b/be/src/scheduling/request-pool-service.cc
index b674bf0..bad5ac1 100644
--- a/be/src/scheduling/request-pool-service.cc
+++ b/be/src/scheduling/request-pool-service.cc
@@ -23,6 +23,7 @@
 #include <string>
 #include <gutil/strings/substitute.h>
 
+#include "common/constant-strings.h"
 #include "common/logging.h"
 #include "rpc/jni-thrift-util.h"
 #include "service/query-options.h"
@@ -56,12 +57,12 @@ DEFINE_int64(default_pool_max_requests, -1, "Maximum number of concurrent outsta
     "requests allowed to run before queueing incoming requests. A negative value "
     "indicates no limit. 0 indicates no requests will be admitted. Ignored if "
     "fair_scheduler_config_path and llama_site_path are set.");
-DEFINE_string(default_pool_mem_limit, "", "Maximum amount of memory that all "
-    "outstanding requests in this pool may use before new requests to this pool"
-    " are queued. Specified as a number of bytes ('<int>[bB]?'), megabytes "
-    "('<float>[mM]'), gigabytes ('<float>[gG]'), or percentage of the physical memory "
-    "('<int>%'). 0 or not setting indicates no limit. Defaults to bytes if no unit is "
-    "given. Ignored if fair_scheduler_config_path and llama_site_path are set.");
+
+static const string default_pool_mem_limit_help_msg = "Maximum amount of memory that all "
+    "outstanding requests in this pool may use before new requests to this pool "
+    "are queued. " + Substitute(MEM_UNITS_HELP_MSG, "the physical memory") + " "
+    "Ignored if fair_scheduler_config_path and llama_site_path are set.";
+DEFINE_string(default_pool_mem_limit, "", default_pool_mem_limit_help_msg.c_str());
 DEFINE_int64(default_pool_max_queued, 200, "Maximum number of requests allowed to be "
     "queued before rejecting requests. A negative value or 0 indicates requests "
     "will always be rejected once the maximum number of concurrent requests are "


[02/11] incubator-impala git commit: IMPALA-5546: Allow creating unpartitioned Kudu tables

Posted by ta...@apache.org.
IMPALA-5546: Allow creating unpartitioned Kudu tables

This patch makes it possible to create unpartitioned, managed Kudu
tables from Impala, by making the 'PARTITION BY' clause of 'CREATE
TABLE... STORED AS KUDU' optional:

CREATE TABLE [IF NOT EXISTS] [db_name.]table_name
  (col_name data_type
    [kudu_column_attribute ...]
    [COMMENT 'col_comment']
    [, ...]
    [PRIMARY KEY (col_name[, ...])]
  )
  [PARTITION BY kudu_partition_clause]
  [COMMENT 'table_comment']
  STORED AS KUDU
  [TBLPROPERTIES ('key1'='value1', 'key2'='value2', ...)]

Kudu represents this as a table that is range partitioned on no
columns.

Because unpartitioned Kudu tables are inefficient for large data
sizes, and because the syntax doesn't make it explicit that the table
will be unpartitioned, there is a warning issued to encourage users
to create partitioned tables.

This patch also converts the tpch_kudu.nation and tpch_kudu.region
tables to be unpartitioned, as they are very small.

Testing:
- Updated analysis tests.
- Added e2e test that creates unpartitioned table and inserts into it.

Change-Id: I281f173dbec1484eb13434d53ea581a0f245358a
Reviewed-on: http://gerrit.cloudera.org:8080/7446
Reviewed-by: Thomas Tauber-Marshall <tm...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/b881fba7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/b881fba7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/b881fba7

Branch: refs/heads/master
Commit: b881fba7633f138c251532f3c58255689db4e22b
Parents: 3deb1a9
Author: Thomas Tauber-Marshall <tm...@cloudera.com>
Authored: Mon Jul 17 12:25:45 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Mon Aug 7 19:53:59 2017 +0000

----------------------------------------------------------------------
 .../apache/impala/analysis/CreateTableStmt.java |  4 +-
 .../impala/service/KuduCatalogOpExecutor.java   |  4 ++
 .../apache/impala/analysis/AnalyzeDDLTest.java  | 15 +++++---
 testdata/datasets/tpch/tpch_kudu_template.sql   |  2 -
 testdata/datasets/tpch/tpch_schema_template.sql |  4 +-
 .../queries/QueryTest/kudu_create.test          | 40 ++++++++++++++++++++
 6 files changed, 58 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b881fba7/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java b/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
index 17ac46d..6169997 100644
--- a/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
@@ -320,8 +320,8 @@ public class CreateTableStmt extends StatementBase {
     if (!getKuduPartitionParams().isEmpty()) {
       analyzeKuduPartitionParams(analyzer);
     } else {
-      throw new AnalysisException("Table partitioning must be specified for " +
-          "managed Kudu tables.");
+      analyzer.addWarning(
+          "Unpartitioned Kudu tables are inefficient for large data sizes.");
     }
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b881fba7/fe/src/main/java/org/apache/impala/service/KuduCatalogOpExecutor.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/service/KuduCatalogOpExecutor.java b/fe/src/main/java/org/apache/impala/service/KuduCatalogOpExecutor.java
index cbbfccf..c81aca4 100644
--- a/fe/src/main/java/org/apache/impala/service/KuduCatalogOpExecutor.java
+++ b/fe/src/main/java/org/apache/impala/service/KuduCatalogOpExecutor.java
@@ -181,6 +181,10 @@ public class KuduCatalogOpExecutor {
       if (!hasRangePartitioning) {
         tableOpts.setRangePartitionColumns(Collections.<String>emptyList());
       }
+    } else {
+      // This table is unpartitioned, which Kudu represents as a table range partitioned
+      // on no columns.
+      tableOpts.setRangePartitionColumns(Collections.<String>emptyList());
     }
 
     // Set the number of table replicas, if specified.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b881fba7/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
index 6928ed2..2a3e383 100644
--- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
@@ -1587,6 +1587,10 @@ public class AnalyzeDDLTest extends FrontendTestBase {
         "partition value = 30) stored as kudu as select id, bool_col, tinyint_col, " +
         "smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, " +
         "string_col from functional.alltypestiny");
+    // Creating unpartitioned table results in a warning.
+    AnalyzesOk("create table t primary key(id) stored as kudu as select id, bool_col " +
+        "from functional.alltypestiny",
+        "Unpartitioned Kudu tables are inefficient for large data sizes.");
     // CTAS in an external Kudu table
     AnalysisError("create external table t stored as kudu " +
         "tblproperties('kudu.table_name'='t') as select id, int_col from " +
@@ -2197,9 +2201,10 @@ public class AnalyzeDDLTest extends FrontendTestBase {
     AnalysisError("create table tab (x int) tblproperties (" +
         "'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler')",
         CreateTableStmt.KUDU_STORAGE_HANDLER_ERROR_MESSAGE);
-    AnalysisError("create table tab (x int primary key) stored as kudu tblproperties (" +
+    // Creating unpartitioned table results in a warning.
+    AnalyzesOk("create table tab (x int primary key) stored as kudu tblproperties (" +
         "'storage_handler'='com.cloudera.kudu.hive.KuduStorageHandler')",
-        "Table partitioning must be specified for managed Kudu tables.");
+        "Unpartitioned Kudu tables are inefficient for large data sizes.");
     // Invalid value for number of replicas
     AnalysisError("create table t (x int primary key) stored as kudu tblproperties (" +
         "'kudu.num_tablet_replicas'='1.1')",
@@ -2211,9 +2216,9 @@ public class AnalyzeDDLTest extends FrontendTestBase {
     AnalysisError("create table tab (a int primary key) partition by hash (a) " +
         "partitions 3 stored as kudu location '/test-warehouse/'",
         "LOCATION cannot be specified for a Kudu table.");
-    // PARTITION BY is required for managed tables.
-    AnalysisError("create table tab (a int, primary key (a)) stored as kudu",
-        "Table partitioning must be specified for managed Kudu tables.");
+    // Creating unpartitioned table results in a warning.
+    AnalyzesOk("create table tab (a int, primary key (a)) stored as kudu",
+        "Unpartitioned Kudu tables are inefficient for large data sizes.");
     AnalysisError("create table tab (a int) stored as kudu",
         "A primary key is required for a Kudu table.");
     // Using ROW FORMAT with a Kudu table

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b881fba7/testdata/datasets/tpch/tpch_kudu_template.sql
----------------------------------------------------------------------
diff --git a/testdata/datasets/tpch/tpch_kudu_template.sql b/testdata/datasets/tpch/tpch_kudu_template.sql
index 62fa072..032a19a 100644
--- a/testdata/datasets/tpch/tpch_kudu_template.sql
+++ b/testdata/datasets/tpch/tpch_kudu_template.sql
@@ -122,7 +122,6 @@ CREATE TABLE IF NOT EXISTS {target_db_name}.nation (
   N_REGIONKEY BIGINT,
   N_COMMENT STRING
 )
-partition by hash (n_nationkey) partitions {buckets}
 STORED AS KUDU
 tblproperties ('kudu.master_addresses' = '{kudu_master}:7051');
 
@@ -134,7 +133,6 @@ CREATE TABLE IF NOT EXISTS {target_db_name}.region (
   R_NAME STRING,
   R_COMMENT STRING
 )
-partition by hash (r_regionkey) partitions {buckets}
 STORED AS KUDU
 tblproperties ('kudu.master_addresses' = '{kudu_master}:7051');
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b881fba7/testdata/datasets/tpch/tpch_schema_template.sql
----------------------------------------------------------------------
diff --git a/testdata/datasets/tpch/tpch_schema_template.sql b/testdata/datasets/tpch/tpch_schema_template.sql
index 2f99fd0..dfba06d 100644
--- a/testdata/datasets/tpch/tpch_schema_template.sql
+++ b/testdata/datasets/tpch/tpch_schema_template.sql
@@ -179,7 +179,7 @@ create table if not exists {db_name}{db_suffix}.{table_name} (
   N_REGIONKEY SMALLINT,
   N_COMMENT STRING
 )
-partition by hash (n_nationkey) partitions 9 stored as kudu;
+stored as kudu;
 ---- DEPENDENT_LOAD
 INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name};
 ---- LOAD
@@ -202,7 +202,7 @@ create table if not exists {db_name}{db_suffix}.{table_name} (
   R_NAME STRING,
   R_COMMENT STRING
 )
-partition by hash (r_regionkey) partitions 9 stored as kudu;
+stored as kudu;
 ---- DEPENDENT_LOAD
 INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name};
 ---- LOAD

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b881fba7/testdata/workloads/functional-query/queries/QueryTest/kudu_create.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/kudu_create.test b/testdata/workloads/functional-query/queries/QueryTest/kudu_create.test
index 4aaed16..f6e16e1 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/kudu_create.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/kudu_create.test
@@ -248,3 +248,43 @@ I, TS1, TS2
 ---- TYPES
 INT,TIMESTAMP,TIMESTAMP
 ====
+---- QUERY
+# create an unpartitioned table
+create table unpartitioned_kudu_table (col0 bigint primary key, col1 string)
+stored as kudu
+---- RESULTS
+---- ERRORS
+Unpartitioned Kudu tables are inefficient for large data sizes.
+====
+---- QUERY
+insert into unpartitioned_kudu_table values (0, 'zero'), (1, 'one')
+---- RUNTIME_PROFILE
+NumModifiedRows: 2
+NumRowErrors: 0
+---- LABELS
+COL0,COL1
+---- DML_RESULTS: unpartitioned_kudu_table
+0,'zero'
+1,'one'
+---- TYPES
+BIGINT,STRING
+====
+---- QUERY
+create table unpartitioned_kudu_table2 primary key(id) stored as kudu
+as select id from functional.alltypestiny where id > 4
+---- RESULTS
+'Inserted 3 row(s)'
+---- ERRORS
+Unpartitioned Kudu tables are inefficient for large data sizes.
+====
+---- QUERY
+select * from unpartitioned_kudu_table2
+---- RESULTS
+5
+6
+7
+---- LABELS
+ID
+---- TYPES
+INT
+====
\ No newline at end of file


[08/11] incubator-impala git commit: IMPALA-4674: Part 2.5: Rename BufferedTupleStreamV2

Posted by ta...@apache.org.
IMPALA-4674: Part 2.5: Rename BufferedTupleStreamV2

This is cleanup that wasn't included in Part 2.

Testing:
Confirmed that everything (including be tests) built ok,
buffered-tuple-stream-v2-test passed and that I could
run a couple of basic queries.

Change-Id: Ib8b23d7c2d7488d9f74b08cc9adb4ed1a93e3591
Reviewed-on: http://gerrit.cloudera.org:8080/7609
Reviewed-by: Matthew Jacobs <mj...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/0c46147e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/0c46147e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/0c46147e

Branch: refs/heads/master
Commit: 0c46147e5fd93e2a9a63d145d60b656d2f6a7612
Parents: 5caadbb
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Mon Aug 7 09:07:00 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Aug 8 10:22:20 2017 +0000

----------------------------------------------------------------------
 be/src/exec/analytic-eval-node.cc               |    8 +-
 be/src/exec/analytic-eval-node.h                |    4 +-
 be/src/exec/hash-table-test.cc                  |   12 +-
 be/src/exec/hash-table.cc                       |    4 +-
 be/src/exec/hash-table.h                        |   14 +-
 be/src/exec/hash-table.inline.h                 |    2 +-
 be/src/exec/partitioned-aggregation-node-ir.cc  |    2 +-
 be/src/exec/partitioned-aggregation-node.cc     |   28 +-
 be/src/exec/partitioned-aggregation-node.h      |   12 +-
 be/src/exec/partitioned-hash-join-builder-ir.cc |   10 +-
 be/src/exec/partitioned-hash-join-builder.cc    |   34 +-
 be/src/exec/partitioned-hash-join-builder.h     |   24 +-
 be/src/exec/partitioned-hash-join-node-ir.cc    |    4 +-
 be/src/exec/partitioned-hash-join-node.cc       |   34 +-
 be/src/exec/partitioned-hash-join-node.h        |   14 +-
 be/src/exec/partitioned-hash-join-node.inline.h |    2 +-
 be/src/runtime/CMakeLists.txt                   |    4 +-
 be/src/runtime/buffered-tuple-stream-test.cc    | 1462 ++++++++++++++++++
 be/src/runtime/buffered-tuple-stream-v2-test.cc | 1462 ------------------
 be/src/runtime/buffered-tuple-stream-v2.cc      | 1084 -------------
 be/src/runtime/buffered-tuple-stream-v2.h       |  705 ---------
 .../runtime/buffered-tuple-stream-v2.inline.h   |   56 -
 be/src/runtime/buffered-tuple-stream.cc         | 1084 +++++++++++++
 be/src/runtime/buffered-tuple-stream.h          |  705 +++++++++
 be/src/runtime/buffered-tuple-stream.inline.h   |   56 +
 25 files changed, 3413 insertions(+), 3413 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/analytic-eval-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/analytic-eval-node.cc b/be/src/exec/analytic-eval-node.cc
index f6d96ae..af4f866 100644
--- a/be/src/exec/analytic-eval-node.cc
+++ b/be/src/exec/analytic-eval-node.cc
@@ -23,7 +23,7 @@
 #include "exprs/agg-fn-evaluator.h"
 #include "exprs/scalar-expr.h"
 #include "exprs/scalar-expr-evaluator.h"
-#include "runtime/buffered-tuple-stream-v2.inline.h"
+#include "runtime/buffered-tuple-stream.inline.h"
 #include "runtime/descriptors.h"
 #include "runtime/mem-tracker.h"
 #include "runtime/query-state.h"
@@ -195,7 +195,7 @@ Status AnalyticEvalNode::Open(RuntimeState* state) {
     RETURN_IF_ERROR(ClaimBufferReservation(state));
   }
   DCHECK(input_stream_ == nullptr);
-  input_stream_.reset(new BufferedTupleStreamV2(state, child(0)->row_desc(),
+  input_stream_.reset(new BufferedTupleStream(state, child(0)->row_desc(),
       &buffer_pool_client_, resource_profile_.spillable_buffer_size,
       resource_profile_.spillable_buffer_size));
   RETURN_IF_ERROR(input_stream_->Init(id(), true));
@@ -363,7 +363,7 @@ inline Status AnalyticEvalNode::AddRow(int64_t stream_idx, TupleRow* row) {
     // TODO: Consider re-pinning later if the output stream is fully consumed.
     RETURN_IF_ERROR(status);
     RETURN_IF_ERROR(state_->StartSpilling(mem_tracker()));
-    input_stream_->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
+    input_stream_->UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT);
     VLOG_FILE << id() << " Unpin input stream while adding row idx=" << stream_idx;
     if (!input_stream_->AddRow(row, &status)) {
       // Rows should be added in unpinned mode unless an error occurs.
@@ -623,7 +623,7 @@ Status AnalyticEvalNode::ProcessChildBatch(RuntimeState* state) {
             << " tuple pool size:" << curr_tuple_pool_->total_allocated_bytes();
   SCOPED_TIMER(evaluation_timer_);
 
-  // BufferedTupleStreamV2::num_rows() returns the total number of rows that have been
+  // BufferedTupleStream::num_rows() returns the total number of rows that have been
   // inserted into the stream (it does not decrease when we read rows), so the index of
   // the next input row that will be inserted will be the current size of the stream.
   int64_t stream_idx = input_stream_->num_rows();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/analytic-eval-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/analytic-eval-node.h b/be/src/exec/analytic-eval-node.h
index eab9198..671eaa4 100644
--- a/be/src/exec/analytic-eval-node.h
+++ b/be/src/exec/analytic-eval-node.h
@@ -19,7 +19,7 @@
 #define IMPALA_EXEC_ANALYTIC_EVAL_NODE_H
 
 #include "exec/exec-node.h"
-#include "runtime/buffered-tuple-stream-v2.h"
+#include "runtime/buffered-tuple-stream.h"
 #include "runtime/tuple.h"
 
 namespace impala {
@@ -339,7 +339,7 @@ class AnalyticEvalNode : public ExecNode {
   /// buffers with tuple data are attached to an output row batch on eos or
   /// ReachedLimit().
   /// TODO: Consider re-pinning unpinned streams when possible.
-  boost::scoped_ptr<BufferedTupleStreamV2> input_stream_;
+  boost::scoped_ptr<BufferedTupleStream> input_stream_;
 
   /// Pool used for O(1) allocations that live until Close() or Reset().
   /// Does not own data backing tuples returned in GetNext(), so it does not

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/hash-table-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table-test.cc b/be/src/exec/hash-table-test.cc
index 7a6ec9d..aad9134 100644
--- a/be/src/exec/hash-table-test.cc
+++ b/be/src/exec/hash-table-test.cc
@@ -309,7 +309,7 @@ class HashTableTest : public testing::Test {
 
     for (int i = 0; i < 2; ++i) {
       if (!ht_ctx->EvalAndHashBuild(build_rows[i])) continue;
-      BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
+      BufferedTupleStream::FlatRowPtr dummy_flat_row = nullptr;
       EXPECT_TRUE(hash_table->stores_tuples_);
       Status status;
       bool inserted =
@@ -350,7 +350,7 @@ class HashTableTest : public testing::Test {
     ASSERT_TRUE(success);
     for (int i = 0; i < 5; ++i) {
       if (!ht_ctx->EvalAndHashBuild(build_rows[i])) continue;
-      BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
+      BufferedTupleStream::FlatRowPtr dummy_flat_row = nullptr;
       EXPECT_TRUE(hash_table->stores_tuples_);
       bool inserted =
           hash_table->Insert(ht_ctx.get(), dummy_flat_row, build_rows[i], &status);
@@ -418,7 +418,7 @@ class HashTableTest : public testing::Test {
       for (int i = 0; i < val; ++i) {
         TupleRow* row = CreateTupleRow(val);
         if (!ht_ctx->EvalAndHashBuild(row)) continue;
-        BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
+        BufferedTupleStream::FlatRowPtr dummy_flat_row = nullptr;
         EXPECT_TRUE(hash_table->stores_tuples_);
         ASSERT_TRUE(hash_table->Insert(ht_ctx.get(), dummy_flat_row, row, &status));
         ASSERT_OK(status);
@@ -481,7 +481,7 @@ class HashTableTest : public testing::Test {
       for (int j = 0; j < num_to_add; ++build_row_val, ++j) {
         TupleRow* row = CreateTupleRow(build_row_val);
         if (!ht_ctx->EvalAndHashBuild(row)) continue;
-        BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
+        BufferedTupleStream::FlatRowPtr dummy_flat_row = nullptr;
         EXPECT_TRUE(hash_table->stores_tuples_);
         bool inserted = hash_table->Insert(ht_ctx.get(), dummy_flat_row, row, &status);
         ASSERT_OK(status);
@@ -518,7 +518,7 @@ class HashTableTest : public testing::Test {
     while (true) {
       TupleRow* duplicate_row = CreateTupleRow(DUPLICATE_VAL);
       if (!ht_ctx->EvalAndHashBuild(duplicate_row)) continue;
-      BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
+      BufferedTupleStream::FlatRowPtr dummy_flat_row = nullptr;
       bool inserted =
           hash_table->Insert(ht_ctx.get(), dummy_flat_row, duplicate_row, &status);
       ASSERT_OK(status);
@@ -569,7 +569,7 @@ class HashTableTest : public testing::Test {
 
       // Insert using both Insert() and FindBucket() methods.
       if (build_row_val % 2 == 0) {
-        BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
+        BufferedTupleStream::FlatRowPtr dummy_flat_row = nullptr;
         EXPECT_TRUE(hash_table->stores_tuples_);
         bool inserted = hash_table->Insert(ht_ctx.get(), dummy_flat_row, row, &status);
         EXPECT_TRUE(inserted);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/hash-table.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table.cc b/be/src/exec/hash-table.cc
index aacedc2..e65d9f1 100644
--- a/be/src/exec/hash-table.cc
+++ b/be/src/exec/hash-table.cc
@@ -385,14 +385,14 @@ constexpr double HashTable::MAX_FILL_FACTOR;
 constexpr int64_t HashTable::DATA_PAGE_SIZE;
 
 HashTable* HashTable::Create(Suballocator* allocator, bool stores_duplicates,
-    int num_build_tuples, BufferedTupleStreamV2* tuple_stream, int64_t max_num_buckets,
+    int num_build_tuples, BufferedTupleStream* tuple_stream, int64_t max_num_buckets,
     int64_t initial_num_buckets) {
   return new HashTable(FLAGS_enable_quadratic_probing, allocator, stores_duplicates,
       num_build_tuples, tuple_stream, max_num_buckets, initial_num_buckets);
 }
 
 HashTable::HashTable(bool quadratic_probing, Suballocator* allocator,
-    bool stores_duplicates, int num_build_tuples, BufferedTupleStreamV2* stream,
+    bool stores_duplicates, int num_build_tuples, BufferedTupleStream* stream,
     int64_t max_num_buckets, int64_t num_buckets)
   : allocator_(allocator),
     tuple_stream_(stream),

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/hash-table.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table.h b/be/src/exec/hash-table.h
index 297e619..d764640 100644
--- a/be/src/exec/hash-table.h
+++ b/be/src/exec/hash-table.h
@@ -26,8 +26,8 @@
 #include "codegen/impala-ir.h"
 #include "common/compiler-util.h"
 #include "common/logging.h"
-#include "runtime/buffered-tuple-stream-v2.h"
-#include "runtime/buffered-tuple-stream-v2.inline.h"
+#include "runtime/buffered-tuple-stream.h"
+#include "runtime/buffered-tuple-stream.inline.h"
 #include "runtime/bufferpool/buffer-pool.h"
 #include "runtime/bufferpool/suballocator.h"
 #include "runtime/tuple-row.h"
@@ -533,7 +533,7 @@ class HashTable {
   /// of two formats, depending on the number of tuples in the row.
   union HtData {
     // For rows with multiple tuples per row, a pointer to the flattened TupleRow.
-    BufferedTupleStreamV2::FlatRowPtr flat_row;
+    BufferedTupleStream::FlatRowPtr flat_row;
     // For rows with one tuple per row, a pointer to the Tuple itself.
     Tuple* tuple;
   };
@@ -600,7 +600,7 @@ class HashTable {
   ///  - initial_num_buckets: number of buckets that the hash table should be initialized
   ///    with.
   static HashTable* Create(Suballocator* allocator, bool stores_duplicates,
-      int num_build_tuples, BufferedTupleStreamV2* tuple_stream, int64_t max_num_buckets,
+      int num_build_tuples, BufferedTupleStream* tuple_stream, int64_t max_num_buckets,
       int64_t initial_num_buckets);
 
   /// Allocates the initial bucket structure. Returns a non-OK status if an error is
@@ -623,7 +623,7 @@ class HashTable {
   /// is stored. The 'row' is not copied by the hash table and the caller must guarantee
   /// it stays in memory. This will not grow the hash table.
   bool IR_ALWAYS_INLINE Insert(HashTableCtx* ht_ctx,
-      BufferedTupleStreamV2::FlatRowPtr flat_row, TupleRow* row,
+      BufferedTupleStream::FlatRowPtr flat_row, TupleRow* row,
       Status* status) WARN_UNUSED_RESULT;
 
   /// Prefetch the hash table bucket which the given hash value 'hash' maps to.
@@ -819,7 +819,7 @@ class HashTable {
   ///  - quadratic_probing: set to true when the probing algorithm is quadratic, as
   ///    opposed to linear.
   HashTable(bool quadratic_probing, Suballocator* allocator, bool stores_duplicates,
-      int num_build_tuples, BufferedTupleStreamV2* tuple_stream, int64_t max_num_buckets,
+      int num_build_tuples, BufferedTupleStream* tuple_stream, int64_t max_num_buckets,
       int64_t initial_num_buckets);
 
   /// Performs the probing operation according to the probing algorithm (linear or
@@ -918,7 +918,7 @@ class HashTable {
   /// Stream contains the rows referenced by the hash table. Can be NULL if the
   /// row only contains a single tuple, in which case the TupleRow indirection
   /// is removed by the hash table.
-  BufferedTupleStreamV2* tuple_stream_;
+  BufferedTupleStream* tuple_stream_;
 
   /// Constants on how the hash table should behave.
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/hash-table.inline.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table.inline.h b/be/src/exec/hash-table.inline.h
index ce2f784..85d7ad6 100644
--- a/be/src/exec/hash-table.inline.h
+++ b/be/src/exec/hash-table.inline.h
@@ -109,7 +109,7 @@ inline HashTable::HtData* HashTable::InsertInternal(
 }
 
 inline bool HashTable::Insert(HashTableCtx* ht_ctx,
-    BufferedTupleStreamV2::FlatRowPtr flat_row, TupleRow* row, Status* status) {
+    BufferedTupleStream::FlatRowPtr flat_row, TupleRow* row, Status* status) {
   HtData* htdata = InsertInternal(ht_ctx, status);
   // If successful insert, update the contents of the newly inserted entry with 'idx'.
   if (LIKELY(htdata != NULL)) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/partitioned-aggregation-node-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node-ir.cc b/be/src/exec/partitioned-aggregation-node-ir.cc
index 126a2a5..9baada1 100644
--- a/be/src/exec/partitioned-aggregation-node-ir.cc
+++ b/be/src/exec/partitioned-aggregation-node-ir.cc
@@ -21,7 +21,7 @@
 #include "exprs/agg-fn-evaluator.h"
 #include "exprs/scalar-expr.h"
 #include "exprs/scalar-expr-evaluator.h"
-#include "runtime/buffered-tuple-stream-v2.inline.h"
+#include "runtime/buffered-tuple-stream.inline.h"
 #include "runtime/row-batch.h"
 #include "runtime/tuple-row.h"
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/partitioned-aggregation-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node.cc b/be/src/exec/partitioned-aggregation-node.cc
index fc0a4a6..16db5cc 100644
--- a/be/src/exec/partitioned-aggregation-node.cc
+++ b/be/src/exec/partitioned-aggregation-node.cc
@@ -31,7 +31,7 @@
 #include "exprs/scalar-expr-evaluator.h"
 #include "exprs/slot-ref.h"
 #include "gutil/strings/substitute.h"
-#include "runtime/buffered-tuple-stream-v2.inline.h"
+#include "runtime/buffered-tuple-stream.inline.h"
 #include "runtime/descriptors.h"
 #include "runtime/exec-env.h"
 #include "runtime/mem-pool.h"
@@ -275,7 +275,7 @@ Status PartitionedAggregationNode::Open(RuntimeState* state) {
           &buffer_pool_client_, resource_profile_.spillable_buffer_size));
 
       if (!is_streaming_preagg_ && needs_serialize_) {
-        serialize_stream_.reset(new BufferedTupleStreamV2(state, &intermediate_row_desc_,
+        serialize_stream_.reset(new BufferedTupleStream(state, &intermediate_row_desc_,
             &buffer_pool_client_, resource_profile_.spillable_buffer_size,
             resource_profile_.spillable_buffer_size));
         RETURN_IF_ERROR(serialize_stream_->Init(id(), false));
@@ -722,7 +722,7 @@ Status PartitionedAggregationNode::Partition::InitStreams() {
     }
   }
 
-  aggregated_row_stream.reset(new BufferedTupleStreamV2(parent->state_,
+  aggregated_row_stream.reset(new BufferedTupleStream(parent->state_,
       &parent->intermediate_row_desc_, &parent->buffer_pool_client_,
       parent->resource_profile_.spillable_buffer_size,
       parent->resource_profile_.spillable_buffer_size, external_varlen_slots));
@@ -740,7 +740,7 @@ Status PartitionedAggregationNode::Partition::InitStreams() {
   }
 
   if (!parent->is_streaming_preagg_) {
-    unaggregated_row_stream.reset(new BufferedTupleStreamV2(parent->state_,
+    unaggregated_row_stream.reset(new BufferedTupleStream(parent->state_,
         parent->child(0)->row_desc(), &parent->buffer_pool_client_,
         parent->resource_profile_.spillable_buffer_size,
         parent->resource_profile_.spillable_buffer_size));
@@ -786,7 +786,7 @@ Status PartitionedAggregationNode::Partition::SerializeStreamForSpilling() {
 
     // Serialize and copy the spilled partition's stream into the new stream.
     Status status;
-    BufferedTupleStreamV2* new_stream = parent->serialize_stream_.get();
+    BufferedTupleStream* new_stream = parent->serialize_stream_.get();
     HashTable::Iterator it = hash_tbl->Begin(parent->ht_ctx_.get());
     while (!it.AtEnd()) {
       Tuple* tuple = it.GetTuple();
@@ -811,7 +811,7 @@ Status PartitionedAggregationNode::Partition::SerializeStreamForSpilling() {
     // when we need to spill again. We need to have this available before we need
     // to spill to make sure it is available. This should be acquirable since we just
     // freed at least one buffer from this partition's (old) aggregated_row_stream.
-    parent->serialize_stream_.reset(new BufferedTupleStreamV2(parent->state_,
+    parent->serialize_stream_.reset(new BufferedTupleStream(parent->state_,
         &parent->intermediate_row_desc_, &parent->buffer_pool_client_,
         parent->resource_profile_.spillable_buffer_size,
         parent->resource_profile_.spillable_buffer_size));
@@ -866,9 +866,9 @@ Status PartitionedAggregationNode::Partition::Spill(bool more_aggregate_rows) {
   DCHECK(aggregated_row_stream->has_write_iterator());
   DCHECK(!unaggregated_row_stream->has_write_iterator());
   if (more_aggregate_rows) {
-    aggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
+    aggregated_row_stream->UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT);
   } else {
-    aggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+    aggregated_row_stream->UnpinStream(BufferedTupleStream::UNPIN_ALL);
     bool got_buffer;
     RETURN_IF_ERROR(unaggregated_row_stream->PrepareForWrite(&got_buffer));
     DCHECK(got_buffer)
@@ -932,7 +932,7 @@ Tuple* PartitionedAggregationNode::ConstructIntermediateTuple(
 }
 
 Tuple* PartitionedAggregationNode::ConstructIntermediateTuple(
-    const vector<AggFnEvaluator*>& agg_fn_evals, BufferedTupleStreamV2* stream,
+    const vector<AggFnEvaluator*>& agg_fn_evals, BufferedTupleStream* stream,
     Status* status) noexcept {
   DCHECK(stream != NULL && status != NULL);
   // Allocate space for the entire tuple in the stream.
@@ -1077,7 +1077,7 @@ Status PartitionedAggregationNode::AppendSpilledRow(
     Partition* __restrict__ partition, TupleRow* __restrict__ row) {
   DCHECK(!is_streaming_preagg_);
   DCHECK(partition->is_spilled());
-  BufferedTupleStreamV2* stream = AGGREGATED_ROWS ?
+  BufferedTupleStream* stream = AGGREGATED_ROWS ?
       partition->aggregated_row_stream.get() :
       partition->unaggregated_row_stream.get();
   DCHECK(!stream->is_pinned());
@@ -1297,7 +1297,7 @@ Status PartitionedAggregationNode::RepartitionSpilledPartition() {
     if (!hash_partition->is_spilled()) continue;
     // The aggregated rows have been repartitioned. Free up at least a buffer's worth of
     // reservation and use it to pin the unaggregated write buffer.
-    hash_partition->aggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+    hash_partition->aggregated_row_stream->UnpinStream(BufferedTupleStream::UNPIN_ALL);
     bool got_buffer;
     RETURN_IF_ERROR(
         hash_partition->unaggregated_row_stream->PrepareForWrite(&got_buffer));
@@ -1332,7 +1332,7 @@ Status PartitionedAggregationNode::RepartitionSpilledPartition() {
 }
 
 template <bool AGGREGATED_ROWS>
-Status PartitionedAggregationNode::ProcessStream(BufferedTupleStreamV2* input_stream) {
+Status PartitionedAggregationNode::ProcessStream(BufferedTupleStream* input_stream) {
   DCHECK(!is_streaming_preagg_);
   if (input_stream->num_rows() > 0) {
     while (true) {
@@ -1430,8 +1430,8 @@ void PartitionedAggregationNode::PushSpilledPartition(Partition* partition) {
   // Ensure all pages in the spilled partition's streams are unpinned by invalidating
   // the streams' read and write iterators. We may need all the memory to process the
   // next spilled partitions.
-  partition->aggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
-  partition->unaggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+  partition->aggregated_row_stream->UnpinStream(BufferedTupleStream::UNPIN_ALL);
+  partition->unaggregated_row_stream->UnpinStream(BufferedTupleStream::UNPIN_ALL);
   spilled_partitions_.push_front(partition);
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/partitioned-aggregation-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node.h b/be/src/exec/partitioned-aggregation-node.h
index 4f8b622..c230630 100644
--- a/be/src/exec/partitioned-aggregation-node.h
+++ b/be/src/exec/partitioned-aggregation-node.h
@@ -25,7 +25,7 @@
 
 #include "exec/exec-node.h"
 #include "exec/hash-table.h"
-#include "runtime/buffered-tuple-stream-v2.h"
+#include "runtime/buffered-tuple-stream.h"
 #include "runtime/bufferpool/suballocator.h"
 #include "runtime/descriptors.h" // for TupleId
 #include "runtime/mem-pool.h"
@@ -425,18 +425,18 @@ class PartitionedAggregationNode : public ExecNode {
     /// For streaming preaggs, this may be NULL if sufficient memory is not available.
     /// In that case hash_tbl is also NULL and all rows for the partition will be passed
     /// through.
-    boost::scoped_ptr<BufferedTupleStreamV2> aggregated_row_stream;
+    boost::scoped_ptr<BufferedTupleStream> aggregated_row_stream;
 
     /// Unaggregated rows that are spilled. Always NULL for streaming pre-aggregations.
     /// Always unpinned. Has a write buffer allocated when the partition is spilled and
     /// unaggregated rows are being processed.
-    boost::scoped_ptr<BufferedTupleStreamV2> unaggregated_row_stream;
+    boost::scoped_ptr<BufferedTupleStream> unaggregated_row_stream;
   };
 
   /// Stream used to store serialized spilled rows. Only used if needs_serialize_
   /// is set. This stream is never pinned and only used in Partition::Spill as a
   /// a temporary buffer.
-  boost::scoped_ptr<BufferedTupleStreamV2> serialize_stream_;
+  boost::scoped_ptr<BufferedTupleStream> serialize_stream_;
 
   /// Accessor for 'hash_tbls_' that verifies consistency with the partitions.
   HashTable* ALWAYS_INLINE GetHashTable(int partition_idx) {
@@ -471,7 +471,7 @@ class PartitionedAggregationNode : public ExecNode {
   /// FunctionContexts, so is stored outside the stream. If stream's small buffers get
   /// full, it will attempt to switch to IO-buffers.
   Tuple* ConstructIntermediateTuple(const std::vector<AggFnEvaluator*>& agg_fn_evals,
-      BufferedTupleStreamV2* stream, Status* status) noexcept;
+      BufferedTupleStream* stream, Status* status) noexcept;
 
   /// Constructs intermediate tuple, allocating memory from pool instead of the stream.
   /// Returns NULL and sets status if there is not enough memory to allocate the tuple.
@@ -571,7 +571,7 @@ class PartitionedAggregationNode : public ExecNode {
 
   /// Reads all the rows from input_stream and process them by calling ProcessBatch().
   template <bool AGGREGATED_ROWS>
-  Status ProcessStream(BufferedTupleStreamV2* input_stream) WARN_UNUSED_RESULT;
+  Status ProcessStream(BufferedTupleStream* input_stream) WARN_UNUSED_RESULT;
 
   /// Output 'singleton_output_tuple_' and transfer memory to 'row_batch'.
   void GetSingletonOutput(RowBatch* row_batch);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/partitioned-hash-join-builder-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-builder-ir.cc b/be/src/exec/partitioned-hash-join-builder-ir.cc
index df58036..e15e116 100644
--- a/be/src/exec/partitioned-hash-join-builder-ir.cc
+++ b/be/src/exec/partitioned-hash-join-builder-ir.cc
@@ -19,7 +19,7 @@
 
 #include "codegen/impala-ir.h"
 #include "exec/hash-table.inline.h"
-#include "runtime/buffered-tuple-stream-v2.inline.h"
+#include "runtime/buffered-tuple-stream.inline.h"
 #include "runtime/raw-value.inline.h"
 #include "runtime/row-batch.h"
 #include "runtime/runtime-filter.h"
@@ -30,7 +30,7 @@
 using namespace impala;
 
 inline bool PhjBuilder::AppendRow(
-    BufferedTupleStreamV2* stream, TupleRow* row, Status* status) {
+    BufferedTupleStream* stream, TupleRow* row, Status* status) {
   if (LIKELY(stream->AddRow(row, status))) return true;
   if (UNLIKELY(!status->ok())) return false;
   return AppendRowStreamFull(stream, row, status);
@@ -73,12 +73,12 @@ Status PhjBuilder::ProcessBuildBatch(
 
 bool PhjBuilder::Partition::InsertBatch(TPrefetchMode::type prefetch_mode,
     HashTableCtx* ht_ctx, RowBatch* batch,
-    const vector<BufferedTupleStreamV2::FlatRowPtr>& flat_rows, Status* status) {
+    const vector<BufferedTupleStream::FlatRowPtr>& flat_rows, Status* status) {
   // Compute the hash values and prefetch the hash table buckets.
   const int num_rows = batch->num_rows();
   HashTableCtx::ExprValuesCache* expr_vals_cache = ht_ctx->expr_values_cache();
   const int prefetch_size = expr_vals_cache->capacity();
-  const BufferedTupleStreamV2::FlatRowPtr* flat_rows_data = flat_rows.data();
+  const BufferedTupleStream::FlatRowPtr* flat_rows_data = flat_rows.data();
   for (int prefetch_group_row = 0; prefetch_group_row < num_rows;
        prefetch_group_row += prefetch_size) {
     int cur_row = prefetch_group_row;
@@ -97,7 +97,7 @@ bool PhjBuilder::Partition::InsertBatch(TPrefetchMode::type prefetch_mode,
     expr_vals_cache->ResetForRead();
     FOREACH_ROW_LIMIT(batch, cur_row, prefetch_size, batch_iter) {
       TupleRow* row = batch_iter.Get();
-      BufferedTupleStreamV2::FlatRowPtr flat_row = flat_rows_data[cur_row];
+      BufferedTupleStream::FlatRowPtr flat_row = flat_rows_data[cur_row];
       if (!expr_vals_cache->IsRowNull()
           && UNLIKELY(!hash_tbl_->Insert(ht_ctx, flat_row, row, status))) {
         return false;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/partitioned-hash-join-builder.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-builder.cc b/be/src/exec/partitioned-hash-join-builder.cc
index a2f7c96..2dc2d8d 100644
--- a/be/src/exec/partitioned-hash-join-builder.cc
+++ b/be/src/exec/partitioned-hash-join-builder.cc
@@ -25,7 +25,7 @@
 #include "exec/hash-table.inline.h"
 #include "exprs/scalar-expr.h"
 #include "exprs/scalar-expr-evaluator.h"
-#include "runtime/buffered-tuple-stream-v2.h"
+#include "runtime/buffered-tuple-stream.h"
 #include "runtime/exec-env.h"
 #include "runtime/mem-tracker.h"
 #include "runtime/query-state.h"
@@ -293,11 +293,11 @@ Status PhjBuilder::CreateHashPartitions(int level) {
 }
 
 bool PhjBuilder::AppendRowStreamFull(
-    BufferedTupleStreamV2* stream, TupleRow* row, Status* status) noexcept {
+    BufferedTupleStream* stream, TupleRow* row, Status* status) noexcept {
   while (true) {
     // We ran out of memory. Pick a partition to spill. If we ran out of unspilled
     // partitions, SpillPartition() will return an error status.
-    *status = SpillPartition(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
+    *status = SpillPartition(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT);
     if (!status->ok()) return false;
     if (stream->AddRow(row, status)) return true;
     if (!status->ok()) return false;
@@ -307,7 +307,7 @@ bool PhjBuilder::AppendRowStreamFull(
 }
 
 // TODO: can we do better with a different spilling heuristic?
-Status PhjBuilder::SpillPartition(BufferedTupleStreamV2::UnpinMode mode) {
+Status PhjBuilder::SpillPartition(BufferedTupleStream::UnpinMode mode) {
   DCHECK_EQ(hash_partitions_.size(), PARTITION_FANOUT);
   int64_t max_freed_mem = 0;
   int partition_idx = -1;
@@ -367,7 +367,7 @@ Status PhjBuilder::BuildHashTablesAndPrepareProbeStreams() {
       partition->Close(NULL);
     } else if (partition->is_spilled()) {
       // We don't need any build-side data for spilled partitions in memory.
-      partition->build_rows()->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+      partition->build_rows()->UnpinStream(BufferedTupleStream::UNPIN_ALL);
     }
   }
 
@@ -386,7 +386,7 @@ Status PhjBuilder::BuildHashTablesAndPrepareProbeStreams() {
     RETURN_IF_ERROR(partition->BuildHashTable(&built));
     // If we did not have enough memory to build this hash table, we need to spill this
     // partition (clean up the hash table, unpin build).
-    if (!built) RETURN_IF_ERROR(partition->Spill(BufferedTupleStreamV2::UNPIN_ALL));
+    if (!built) RETURN_IF_ERROR(partition->Spill(BufferedTupleStream::UNPIN_ALL));
   }
 
   // We may have spilled additional partitions while building hash tables, we need to
@@ -423,9 +423,9 @@ Status PhjBuilder::InitSpilledPartitionProbeStreams() {
   while (probe_streams_to_create > 0) {
     // Create stream in vector, so that it will be cleaned up after any failure.
     spilled_partition_probe_streams_.emplace_back(
-        make_unique<BufferedTupleStreamV2>(runtime_state_, probe_row_desc_,
+        make_unique<BufferedTupleStream>(runtime_state_, probe_row_desc_,
             buffer_pool_client_, spillable_buffer_size_, spillable_buffer_size_));
-    BufferedTupleStreamV2* probe_stream = spilled_partition_probe_streams_.back().get();
+    BufferedTupleStream* probe_stream = spilled_partition_probe_streams_.back().get();
     RETURN_IF_ERROR(probe_stream->Init(join_node_id_, false));
 
     // Loop until either the stream gets a buffer or all partitions are spilled (in which
@@ -435,7 +435,7 @@ Status PhjBuilder::InitSpilledPartitionProbeStreams() {
       RETURN_IF_ERROR(probe_stream->PrepareForWrite(&got_buffer));
       if (got_buffer) break;
 
-      RETURN_IF_ERROR(SpillPartition(BufferedTupleStreamV2::UNPIN_ALL));
+      RETURN_IF_ERROR(SpillPartition(BufferedTupleStream::UNPIN_ALL));
       ++probe_streams_to_create;
     }
     --probe_streams_to_create;
@@ -443,7 +443,7 @@ Status PhjBuilder::InitSpilledPartitionProbeStreams() {
   return Status::OK();
 }
 
-vector<unique_ptr<BufferedTupleStreamV2>> PhjBuilder::TransferProbeStreams() {
+vector<unique_ptr<BufferedTupleStream>> PhjBuilder::TransferProbeStreams() {
   return std::move(spilled_partition_probe_streams_);
 }
 
@@ -453,7 +453,7 @@ void PhjBuilder::CloseAndDeletePartitions() {
   all_partitions_.clear();
   hash_partitions_.clear();
   null_aware_partition_ = NULL;
-  for (unique_ptr<BufferedTupleStreamV2>& stream : spilled_partition_probe_streams_) {
+  for (unique_ptr<BufferedTupleStream>& stream : spilled_partition_probe_streams_) {
     stream->Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
   }
   spilled_partition_probe_streams_.clear();
@@ -505,14 +505,14 @@ void PhjBuilder::PublishRuntimeFilters(int64_t num_build_rows) {
 }
 
 Status PhjBuilder::RepartitionBuildInput(
-    Partition* input_partition, int level, BufferedTupleStreamV2* input_probe_rows) {
+    Partition* input_partition, int level, BufferedTupleStream* input_probe_rows) {
   DCHECK_GE(level, 1);
   SCOPED_TIMER(repartition_timer_);
   COUNTER_ADD(num_repartitions_, 1);
   RuntimeState* state = runtime_state_;
 
   // Setup the read buffer and the new partitions.
-  BufferedTupleStreamV2* build_rows = input_partition->build_rows();
+  BufferedTupleStream* build_rows = input_partition->build_rows();
   DCHECK(build_rows != NULL);
   bool got_read_buffer;
   RETURN_IF_ERROR(build_rows->PrepareForRead(true, &got_read_buffer));
@@ -545,7 +545,7 @@ Status PhjBuilder::RepartitionBuildInput(
     bool got_buffer;
     RETURN_IF_ERROR(input_probe_rows->PrepareForRead(true, &got_buffer));
     if (got_buffer) break;
-    RETURN_IF_ERROR(SpillPartition(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT));
+    RETURN_IF_ERROR(SpillPartition(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
   }
 
   RETURN_IF_ERROR(FlushFinal(state));
@@ -573,7 +573,7 @@ bool PhjBuilder::HashTableStoresNulls() const {
 
 PhjBuilder::Partition::Partition(RuntimeState* state, PhjBuilder* parent, int level)
   : parent_(parent), is_spilled_(false), level_(level) {
-  build_rows_ = make_unique<BufferedTupleStreamV2>(state, parent_->row_desc_,
+  build_rows_ = make_unique<BufferedTupleStream>(state, parent_->row_desc_,
       parent_->buffer_pool_client_, parent->spillable_buffer_size_,
       parent->spillable_buffer_size_);
 }
@@ -602,7 +602,7 @@ void PhjBuilder::Partition::Close(RowBatch* batch) {
   }
 }
 
-Status PhjBuilder::Partition::Spill(BufferedTupleStreamV2::UnpinMode mode) {
+Status PhjBuilder::Partition::Spill(BufferedTupleStream::UnpinMode mode) {
   DCHECK(!IsClosed());
   RETURN_IF_ERROR(parent_->runtime_state_->StartSpilling(parent_->mem_tracker()));
   // Close the hash table and unpin the stream backing it to free memory.
@@ -634,7 +634,7 @@ Status PhjBuilder::Partition::BuildHashTable(bool* built) {
   HashTableCtx* ctx = parent_->ht_ctx_.get();
   ctx->set_level(level()); // Set the hash function for building the hash table.
   RowBatch batch(parent_->row_desc_, state->batch_size(), parent_->mem_tracker());
-  vector<BufferedTupleStreamV2::FlatRowPtr> flat_rows;
+  vector<BufferedTupleStream::FlatRowPtr> flat_rows;
   bool eos = false;
 
   // Allocate the partition-local hash table. Initialize the number of buckets based on

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/partitioned-hash-join-builder.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-builder.h b/be/src/exec/partitioned-hash-join-builder.h
index 912613d..277579c 100644
--- a/be/src/exec/partitioned-hash-join-builder.h
+++ b/be/src/exec/partitioned-hash-join-builder.h
@@ -26,7 +26,7 @@
 #include "exec/data-sink.h"
 #include "exec/filter-context.h"
 #include "exec/hash-table.h"
-#include "runtime/buffered-tuple-stream-v2.h"
+#include "runtime/buffered-tuple-stream.h"
 #include "runtime/bufferpool/buffer-pool.h"
 #include "runtime/bufferpool/suballocator.h"
 
@@ -103,7 +103,7 @@ class PhjBuilder : public DataSink {
   /// Transfer ownership of the probe streams to the caller. One stream was allocated per
   /// spilled partition in FlushFinal(). The probe streams are empty but prepared for
   /// writing with a write buffer allocated.
-  std::vector<std::unique_ptr<BufferedTupleStreamV2>> TransferProbeStreams();
+  std::vector<std::unique_ptr<BufferedTupleStream>> TransferProbeStreams();
 
   /// Clears the current list of hash partitions. Called after probing of the partitions
   /// is done. The partitions are not closed or destroyed, since they may be spilled or
@@ -124,7 +124,7 @@ class PhjBuilder : public DataSink {
   /// 'input_probe_rows' for reading in "delete_on_read" mode, so that the probe phase
   /// has enough buffers preallocated to execute successfully.
   Status RepartitionBuildInput(Partition* input_partition, int level,
-      BufferedTupleStreamV2* input_probe_rows) WARN_UNUSED_RESULT;
+      BufferedTupleStream* input_probe_rows) WARN_UNUSED_RESULT;
 
   /// Returns the largest build row count out of the current hash partitions.
   int64_t LargestPartitionRows() const;
@@ -201,10 +201,10 @@ class PhjBuilder : public DataSink {
 
     /// Spills this partition, the partition's stream is unpinned with 'mode' and
     /// its hash table is destroyed if it was built.
-    Status Spill(BufferedTupleStreamV2::UnpinMode mode) WARN_UNUSED_RESULT;
+    Status Spill(BufferedTupleStream::UnpinMode mode) WARN_UNUSED_RESULT;
 
     bool ALWAYS_INLINE IsClosed() const { return build_rows_ == NULL; }
-    BufferedTupleStreamV2* ALWAYS_INLINE build_rows() { return build_rows_.get(); }
+    BufferedTupleStream* ALWAYS_INLINE build_rows() { return build_rows_.get(); }
     HashTable* ALWAYS_INLINE hash_tbl() const { return hash_tbl_.get(); }
     bool ALWAYS_INLINE is_spilled() const { return is_spilled_; }
     int ALWAYS_INLINE level() const { return level_; }
@@ -220,7 +220,7 @@ class PhjBuilder : public DataSink {
     /// failed: if 'status' is ok, inserting failed because not enough reservation
     /// was available and if 'status' is an error, inserting failed because of that error.
     bool InsertBatch(TPrefetchMode::type prefetch_mode, HashTableCtx* ctx,
-        RowBatch* batch, const std::vector<BufferedTupleStreamV2::FlatRowPtr>& flat_rows,
+        RowBatch* batch, const std::vector<BufferedTupleStream::FlatRowPtr>& flat_rows,
         Status* status);
 
     const PhjBuilder* parent_;
@@ -239,7 +239,7 @@ class PhjBuilder : public DataSink {
     /// Stream of build tuples in this partition. Initially owned by this object but
     /// transferred to the parent exec node (via the row batch) when the partition
     /// is closed. If NULL, ownership has been transferred and the partition is closed.
-    std::unique_ptr<BufferedTupleStreamV2> build_rows_;
+    std::unique_ptr<BufferedTupleStream> build_rows_;
   };
 
   /// Computes the minimum number of buffers required to execute the spilling partitioned
@@ -288,19 +288,19 @@ class PhjBuilder : public DataSink {
   /// partitions. This odd return convention is used to avoid emitting unnecessary code
   /// for ~Status in perf-critical code.
   bool AppendRow(
-      BufferedTupleStreamV2* stream, TupleRow* row, Status* status) WARN_UNUSED_RESULT;
+      BufferedTupleStream* stream, TupleRow* row, Status* status) WARN_UNUSED_RESULT;
 
   /// Slow path for AppendRow() above. It is called when the stream has failed to append
   /// the row. We need to find more memory by either switching to IO-buffers, in case the
   /// stream still uses small buffers, or spilling a partition. Returns false and sets
   /// 'status' if it was unable to append the row, even after spilling partitions.
-  bool AppendRowStreamFull(BufferedTupleStreamV2* stream, TupleRow* row,
+  bool AppendRowStreamFull(BufferedTupleStream* stream, TupleRow* row,
       Status* status) noexcept WARN_UNUSED_RESULT;
 
   /// Frees memory by spilling one of the hash partitions. The 'mode' argument is passed
   /// to the Spill() call for the selected partition. The current policy is to spill the
   /// largest partition. Returns non-ok status if we couldn't spill a partition.
-  Status SpillPartition(BufferedTupleStreamV2::UnpinMode mode) WARN_UNUSED_RESULT;
+  Status SpillPartition(BufferedTupleStream::UnpinMode mode) WARN_UNUSED_RESULT;
 
   /// Tries to build hash tables for all unspilled hash partitions. Called after
   /// FlushFinal() when all build rows have been partitioned and added to the appropriate
@@ -464,7 +464,7 @@ class PhjBuilder : public DataSink {
   ///
   /// Because of this, at the end of the build phase, we always have sufficient memory
   /// to execute the probe phase of the algorithm without spilling more partitions.
-  std::vector<std::unique_ptr<BufferedTupleStreamV2>> spilled_partition_probe_streams_;
+  std::vector<std::unique_ptr<BufferedTupleStream>> spilled_partition_probe_streams_;
 
   /// END: Members that must be Reset()
   /////////////////////////////////////////
@@ -479,7 +479,7 @@ class PhjBuilder : public DataSink {
   ProcessBuildBatchFn process_build_batch_fn_level0_;
 
   typedef bool (*InsertBatchFn)(Partition*, TPrefetchMode::type, HashTableCtx*, RowBatch*,
-      const std::vector<BufferedTupleStreamV2::FlatRowPtr>&, Status*);
+      const std::vector<BufferedTupleStream::FlatRowPtr>&, Status*);
   /// Jitted Partition::InsertBatch() function pointers. NULL if codegen is disabled.
   InsertBatchFn insert_batch_fn_;
   InsertBatchFn insert_batch_fn_level0_;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/partitioned-hash-join-node-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-node-ir.cc b/be/src/exec/partitioned-hash-join-node-ir.cc
index b890eb9..3106419 100644
--- a/be/src/exec/partitioned-hash-join-node-ir.cc
+++ b/be/src/exec/partitioned-hash-join-node-ir.cc
@@ -313,7 +313,7 @@ bool IR_ALWAYS_INLINE PartitionedHashJoinNode::NextProbeRow(
           // The partition is not in memory, spill the probe row and move to the next row.
           // Skip the current row if we manage to append to the spilled partition's BTS.
           // Otherwise, we need to bail out and report the failure.
-          BufferedTupleStreamV2* probe_rows = probe_partition->probe_rows();
+          BufferedTupleStream* probe_rows = probe_partition->probe_rows();
           if (UNLIKELY(!AppendProbeRow(probe_rows, current_probe_row_, status))) {
             DCHECK(!status->ok());
             return false;
@@ -438,7 +438,7 @@ int PartitionedHashJoinNode::ProcessProbeBatch(TPrefetchMode::type prefetch_mode
 }
 
 inline bool PartitionedHashJoinNode::AppendProbeRow(
-    BufferedTupleStreamV2* stream, TupleRow* row, Status* status) {
+    BufferedTupleStream* stream, TupleRow* row, Status* status) {
   DCHECK(stream->has_write_iterator());
   DCHECK(!stream->is_pinned());
   return stream->AddRow(row, status);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/partitioned-hash-join-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-node.cc b/be/src/exec/partitioned-hash-join-node.cc
index 2db9e00..806bdc0 100644
--- a/be/src/exec/partitioned-hash-join-node.cc
+++ b/be/src/exec/partitioned-hash-join-node.cc
@@ -27,7 +27,7 @@
 #include "exprs/scalar-expr.h"
 #include "exprs/scalar-expr-evaluator.h"
 #include "exprs/slot-ref.h"
-#include "runtime/buffered-tuple-stream-v2.inline.h"
+#include "runtime/buffered-tuple-stream.inline.h"
 #include "runtime/mem-tracker.h"
 #include "runtime/row-batch.h"
 #include "runtime/runtime-state.h"
@@ -265,7 +265,7 @@ void PartitionedHashJoinNode::Close(RuntimeState* state) {
 
 PartitionedHashJoinNode::ProbePartition::ProbePartition(RuntimeState* state,
     PartitionedHashJoinNode* parent, PhjBuilder::Partition* build_partition,
-    unique_ptr<BufferedTupleStreamV2> probe_rows)
+    unique_ptr<BufferedTupleStream> probe_rows)
   : build_partition_(build_partition),
     probe_rows_(std::move(probe_rows)) {
   DCHECK(probe_rows_->has_write_iterator());
@@ -328,7 +328,7 @@ Status PartitionedHashJoinNode::NextSpilledProbeRowBatch(
     probe_batch_pos_ = -1;
     return Status::OK();
   }
-  BufferedTupleStreamV2* probe_rows = input_partition_->probe_rows();
+  BufferedTupleStream* probe_rows = input_partition_->probe_rows();
   if (LIKELY(probe_rows->rows_returned() < probe_rows->num_rows())) {
     // Continue from the current probe stream.
     bool eos = false;
@@ -420,9 +420,9 @@ Status PartitionedHashJoinNode::PrepareSpilledPartitionForProbe(
     ht_ctx_->set_level(next_partition_level);
 
     // Spill to free memory from hash tables and pinned streams for use in new partitions.
-    RETURN_IF_ERROR(build_partition->Spill(BufferedTupleStreamV2::UNPIN_ALL));
+    RETURN_IF_ERROR(build_partition->Spill(BufferedTupleStream::UNPIN_ALL));
     // Temporarily free up the probe buffer to use when repartitioning.
-    input_partition_->probe_rows()->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+    input_partition_->probe_rows()->UnpinStream(BufferedTupleStream::UNPIN_ALL);
     DCHECK_EQ(build_partition->build_rows()->BytesPinned(false), 0) << NodeDebugString();
     DCHECK_EQ(input_partition_->probe_rows()->BytesPinned(false), 0) << NodeDebugString();
     int64_t num_input_rows = build_partition->build_rows()->num_rows();
@@ -822,7 +822,7 @@ static Status NullAwareAntiJoinError(bool build) {
 
 Status PartitionedHashJoinNode::InitNullAwareProbePartition() {
   RuntimeState* state = runtime_state_;
-  unique_ptr<BufferedTupleStreamV2> probe_rows = make_unique<BufferedTupleStreamV2>(
+  unique_ptr<BufferedTupleStream> probe_rows = make_unique<BufferedTupleStream>(
       state, child(0)->row_desc(), &buffer_pool_client_,
       resource_profile_.spillable_buffer_size,
       resource_profile_.spillable_buffer_size);
@@ -847,7 +847,7 @@ error:
 
 Status PartitionedHashJoinNode::InitNullProbeRows() {
   RuntimeState* state = runtime_state_;
-  null_probe_rows_ = make_unique<BufferedTupleStreamV2>(state, child(0)->row_desc(),
+  null_probe_rows_ = make_unique<BufferedTupleStream>(state, child(0)->row_desc(),
       &buffer_pool_client_, resource_profile_.spillable_buffer_size,
       resource_profile_.spillable_buffer_size);
   // TODO: we shouldn't start with this unpinned if spilling is disabled.
@@ -866,8 +866,8 @@ Status PartitionedHashJoinNode::PrepareNullAwarePartition() {
   DCHECK_EQ(probe_batch_pos_, -1);
   DCHECK_EQ(probe_batch_->num_rows(), 0);
 
-  BufferedTupleStreamV2* build_stream = builder_->null_aware_partition()->build_rows();
-  BufferedTupleStreamV2* probe_stream = null_aware_probe_partition_->probe_rows();
+  BufferedTupleStream* build_stream = builder_->null_aware_partition()->build_rows();
+  BufferedTupleStream* probe_stream = null_aware_probe_partition_->probe_rows();
 
   if (build_stream->num_rows() == 0) {
     // There were no build rows. Nothing to do. Just prepare to output the null
@@ -904,7 +904,7 @@ Status PartitionedHashJoinNode::OutputNullAwareProbeRows(RuntimeState* state,
   int num_join_conjuncts = other_join_conjuncts_.size();
   DCHECK(probe_batch_ != NULL);
 
-  BufferedTupleStreamV2* probe_stream = null_aware_probe_partition_->probe_rows();
+  BufferedTupleStream* probe_stream = null_aware_probe_partition_->probe_rows();
   if (probe_batch_pos_ == probe_batch_->num_rows()) {
     probe_batch_pos_ = 0;
     probe_batch_->TransferResourceOwnership(out_batch);
@@ -952,7 +952,7 @@ Status PartitionedHashJoinNode::PrepareForProbe() {
   DCHECK(probe_hash_partitions_.empty());
 
   // Initialize the probe partitions, providing them with probe streams.
-  vector<unique_ptr<BufferedTupleStreamV2>> probe_streams =
+  vector<unique_ptr<BufferedTupleStream>> probe_streams =
       builder_->TransferProbeStreams();
   probe_hash_partitions_.resize(PARTITION_FANOUT);
   for (int i = 0; i < PARTITION_FANOUT; ++i) {
@@ -989,7 +989,7 @@ Status PartitionedHashJoinNode::PrepareForProbe() {
 }
 
 void PartitionedHashJoinNode::CreateProbePartition(
-    int partition_idx, unique_ptr<BufferedTupleStreamV2> probe_rows) {
+    int partition_idx, unique_ptr<BufferedTupleStream> probe_rows) {
   DCHECK_GE(partition_idx, 0);
   DCHECK_LT(partition_idx, probe_hash_partitions_.size());
   DCHECK(probe_hash_partitions_[partition_idx] == NULL);
@@ -998,7 +998,7 @@ void PartitionedHashJoinNode::CreateProbePartition(
 }
 
 Status PartitionedHashJoinNode::EvaluateNullProbe(
-    RuntimeState* state, BufferedTupleStreamV2* build) {
+    RuntimeState* state, BufferedTupleStream* build) {
   if (null_probe_rows_ == NULL || null_probe_rows_->num_rows() == 0) {
     return Status::OK();
   }
@@ -1067,9 +1067,9 @@ Status PartitionedHashJoinNode::CleanUpHashPartitions(
       // can recurse the algorithm and create new hash partitions from spilled partitions.
       // TODO: we shouldn't need to unpin the build stream if we stop spilling
       // while probing.
-      build_partition->build_rows()->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+      build_partition->build_rows()->UnpinStream(BufferedTupleStream::UNPIN_ALL);
       DCHECK_EQ(build_partition->build_rows()->BytesPinned(false), 0);
-      probe_partition->probe_rows()->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+      probe_partition->probe_rows()->UnpinStream(BufferedTupleStream::UNPIN_ALL);
 
       if (probe_partition->probe_rows()->num_rows() != 0
           || NeedToProcessUnmatchedBuildRows()) {
@@ -1108,7 +1108,7 @@ Status PartitionedHashJoinNode::CleanUpHashPartitions(
   // Just finished evaluating the null probe rows with all the non-spilled build
   // partitions. Unpin this now to free this memory for repartitioning.
   if (null_probe_rows_ != NULL) {
-    null_probe_rows_->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
+    null_probe_rows_->UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT);
   }
 
   builder_->ClearHashPartitions();
@@ -1170,7 +1170,7 @@ string PartitionedHashJoinNode::NodeDebugString() const {
     ss << "  Probe hash partition " << i << ": ";
     if (probe_partition != NULL) {
       ss << "probe ptr=" << probe_partition;
-      BufferedTupleStreamV2* probe_rows = probe_partition->probe_rows();
+      BufferedTupleStream* probe_rows = probe_partition->probe_rows();
       if (probe_rows != NULL) {
         ss << "    Probe Rows: " << probe_rows->num_rows()
            << "    (Bytes pinned: " << probe_rows->BytesPinned(false) << ")";

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/partitioned-hash-join-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-node.h b/be/src/exec/partitioned-hash-join-node.h
index b3f663e..6ed5269 100644
--- a/be/src/exec/partitioned-hash-join-node.h
+++ b/be/src/exec/partitioned-hash-join-node.h
@@ -162,7 +162,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
   /// Creates an initialized probe partition at 'partition_idx' in
   /// 'probe_hash_partitions_'.
   void CreateProbePartition(
-      int partition_idx, std::unique_ptr<BufferedTupleStreamV2> probe_rows);
+      int partition_idx, std::unique_ptr<BufferedTupleStream> probe_rows);
 
   /// Append the probe row 'row' to 'stream'. The stream must be unpinned and must have
   /// a write buffer allocated, so this will succeed unless an error is encountered.
@@ -170,7 +170,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
   /// return convention is used to avoid emitting unnecessary code for ~Status in perf-
   /// critical code.
   bool AppendProbeRow(
-      BufferedTupleStreamV2* stream, TupleRow* row, Status* status) WARN_UNUSED_RESULT;
+      BufferedTupleStream* stream, TupleRow* row, Status* status) WARN_UNUSED_RESULT;
 
   /// Probes the hash table for rows matching the current probe row and appends
   /// all the matching build rows (with probe row) to output batch. Returns true
@@ -325,7 +325,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
   /// conjuncts pass (i.e. there is a match).
   /// This is used for NAAJ, when there are NULL probe rows.
   Status EvaluateNullProbe(
-      RuntimeState* state, BufferedTupleStreamV2* build) WARN_UNUSED_RESULT;
+      RuntimeState* state, BufferedTupleStream* build) WARN_UNUSED_RESULT;
 
   /// Prepares to output NULLs on the probe side for NAAJ. Before calling this,
   /// matched_null_probe_ should have been fully evaluated.
@@ -472,7 +472,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
 
   /// For NAAJ, this stream contains all probe rows that had NULL on the hash table
   /// conjuncts. Must be unique_ptr so we can release it and transfer to output batches.
-  std::unique_ptr<BufferedTupleStreamV2> null_probe_rows_;
+  std::unique_ptr<BufferedTupleStream> null_probe_rows_;
 
   /// For each row in null_probe_rows_, true if this row has matched any build row
   /// (i.e. the resulting joined row passes other_join_conjuncts).
@@ -504,7 +504,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
     /// that has been prepared for writing with an I/O-sized write buffer.
     ProbePartition(RuntimeState* state, PartitionedHashJoinNode* parent,
         PhjBuilder::Partition* build_partition,
-        std::unique_ptr<BufferedTupleStreamV2> probe_rows);
+        std::unique_ptr<BufferedTupleStream> probe_rows);
     ~ProbePartition();
 
     /// Prepare to read the probe rows. Allocates the first read block, so reads will
@@ -517,7 +517,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
     /// resources if 'batch' is NULL. Idempotent.
     void Close(RowBatch* batch);
 
-    BufferedTupleStreamV2* ALWAYS_INLINE probe_rows() { return probe_rows_.get(); }
+    BufferedTupleStream* ALWAYS_INLINE probe_rows() { return probe_rows_.get(); }
     PhjBuilder::Partition* build_partition() { return build_partition_; }
 
     inline bool IsClosed() const { return probe_rows_ == NULL; }
@@ -529,7 +529,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
     /// Stream of probe tuples in this partition. Initially owned by this object but
     /// transferred to the parent exec node (via the row batch) when the partition
     /// is complete. If NULL, ownership was transferred and the partition is closed.
-    std::unique_ptr<BufferedTupleStreamV2> probe_rows_;
+    std::unique_ptr<BufferedTupleStream> probe_rows_;
   };
 
   /// For the below codegen'd functions, xxx_fn_level0_ uses CRC hashing when available

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/exec/partitioned-hash-join-node.inline.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-node.inline.h b/be/src/exec/partitioned-hash-join-node.inline.h
index 3441aac..a53b40e 100644
--- a/be/src/exec/partitioned-hash-join-node.inline.h
+++ b/be/src/exec/partitioned-hash-join-node.inline.h
@@ -20,7 +20,7 @@
 
 #include "exec/partitioned-hash-join-node.h"
 
-#include "runtime/buffered-tuple-stream-v2.inline.h"
+#include "runtime/buffered-tuple-stream.inline.h"
 
 namespace impala {
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/runtime/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt
index 92af968..391fd01 100644
--- a/be/src/runtime/CMakeLists.txt
+++ b/be/src/runtime/CMakeLists.txt
@@ -24,7 +24,7 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}/runtime")
 set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}/runtime")
 
 add_library(Runtime
-  buffered-tuple-stream-v2.cc
+  buffered-tuple-stream.cc
   client-cache.cc
   coordinator.cc
   coordinator-backend-state.cc
@@ -91,7 +91,7 @@ ADD_BE_TEST(thread-resource-mgr-test)
 ADD_BE_TEST(mem-tracker-test)
 ADD_BE_TEST(multi-precision-test)
 ADD_BE_TEST(decimal-test)
-ADD_BE_TEST(buffered-tuple-stream-v2-test)
+ADD_BE_TEST(buffered-tuple-stream-test)
 ADD_BE_TEST(hdfs-fs-cache-test)
 ADD_BE_TEST(tmp-file-mgr-test)
 ADD_BE_TEST(row-batch-serialize-test)


[05/11] incubator-impala git commit: IMPALA-4674: Part 2.5: Rename BufferedTupleStreamV2

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/runtime/buffered-tuple-stream-v2.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream-v2.cc b/be/src/runtime/buffered-tuple-stream-v2.cc
deleted file mode 100644
index 2153264..0000000
--- a/be/src/runtime/buffered-tuple-stream-v2.cc
+++ /dev/null
@@ -1,1084 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/buffered-tuple-stream-v2.inline.h"
-
-#include <boost/bind.hpp>
-#include <gutil/strings/substitute.h>
-
-#include "runtime/bufferpool/reservation-tracker.h"
-#include "runtime/collection-value.h"
-#include "runtime/descriptors.h"
-#include "runtime/exec-env.h"
-#include "runtime/mem-tracker.h"
-#include "runtime/row-batch.h"
-#include "runtime/runtime-state.h"
-#include "runtime/string-value.h"
-#include "runtime/tuple-row.h"
-#include "util/bit-util.h"
-#include "util/debug-util.h"
-#include "util/runtime-profile-counters.h"
-
-#include "common/names.h"
-
-#ifdef NDEBUG
-#define CHECK_CONSISTENCY_FAST()
-#define CHECK_CONSISTENCY_FULL()
-#else
-#define CHECK_CONSISTENCY_FAST() CheckConsistencyFast()
-#define CHECK_CONSISTENCY_FULL() CheckConsistencyFull()
-#endif
-
-using namespace impala;
-using namespace strings;
-
-using BufferHandle = BufferPool::BufferHandle;
-
-BufferedTupleStreamV2::BufferedTupleStreamV2(RuntimeState* state,
-    const RowDescriptor* row_desc, BufferPool::ClientHandle* buffer_pool_client,
-    int64_t default_page_len, int64_t max_page_len, const set<SlotId>& ext_varlen_slots)
-  : state_(state),
-    desc_(row_desc),
-    node_id_(-1),
-    buffer_pool_(state->exec_env()->buffer_pool()),
-    buffer_pool_client_(buffer_pool_client),
-    num_pages_(0),
-    total_byte_size_(0),
-    has_read_iterator_(false),
-    read_page_reservation_(buffer_pool_client_),
-    read_page_rows_returned_(-1),
-    read_ptr_(nullptr),
-    read_end_ptr_(nullptr),
-    write_ptr_(nullptr),
-    write_end_ptr_(nullptr),
-    rows_returned_(0),
-    has_write_iterator_(false),
-    write_page_(nullptr),
-    write_page_reservation_(buffer_pool_client_),
-    bytes_pinned_(0),
-    num_rows_(0),
-    default_page_len_(default_page_len),
-    max_page_len_(max_page_len),
-    has_nullable_tuple_(row_desc->IsAnyTupleNullable()),
-    delete_on_read_(false),
-    closed_(false),
-    pinned_(true) {
-  DCHECK_GE(max_page_len, default_page_len);
-  DCHECK(BitUtil::IsPowerOf2(default_page_len)) << default_page_len;
-  DCHECK(BitUtil::IsPowerOf2(max_page_len)) << max_page_len;
-  read_page_ = pages_.end();
-  for (int i = 0; i < desc_->tuple_descriptors().size(); ++i) {
-    const TupleDescriptor* tuple_desc = desc_->tuple_descriptors()[i];
-    const int tuple_byte_size = tuple_desc->byte_size();
-    fixed_tuple_sizes_.push_back(tuple_byte_size);
-
-    vector<SlotDescriptor*> tuple_string_slots;
-    vector<SlotDescriptor*> tuple_coll_slots;
-    for (int j = 0; j < tuple_desc->slots().size(); ++j) {
-      SlotDescriptor* slot = tuple_desc->slots()[j];
-      if (!slot->type().IsVarLenType()) continue;
-      if (ext_varlen_slots.find(slot->id()) == ext_varlen_slots.end()) {
-        if (slot->type().IsVarLenStringType()) {
-          tuple_string_slots.push_back(slot);
-        } else {
-          DCHECK(slot->type().IsCollectionType());
-          tuple_coll_slots.push_back(slot);
-        }
-      }
-    }
-    if (!tuple_string_slots.empty()) {
-      inlined_string_slots_.push_back(make_pair(i, tuple_string_slots));
-    }
-
-    if (!tuple_coll_slots.empty()) {
-      inlined_coll_slots_.push_back(make_pair(i, tuple_coll_slots));
-    }
-  }
-}
-
-BufferedTupleStreamV2::~BufferedTupleStreamV2() {
-  DCHECK(closed_);
-}
-
-void BufferedTupleStreamV2::CheckConsistencyFull() const {
-  CheckConsistencyFast();
-  // The below checks require iterating over all the pages in the stream.
-  DCHECK_EQ(bytes_pinned_, CalcBytesPinned()) << DebugString();
-  DCHECK_EQ(pages_.size(), num_pages_) << DebugString();
-  for (const Page& page : pages_) CheckPageConsistency(&page);
-}
-
-void BufferedTupleStreamV2::CheckConsistencyFast() const {
-  // All the below checks should be O(1).
-  DCHECK(has_write_iterator() || write_page_ == nullptr);
-  if (write_page_ != nullptr) {
-    CheckPageConsistency(write_page_);
-    DCHECK(write_page_->is_pinned());
-    DCHECK(write_page_->retrieved_buffer);
-    const BufferHandle* write_buffer;
-    Status status = write_page_->GetBuffer(&write_buffer);
-    DCHECK(status.ok()); // Write buffer should never have been unpinned.
-    DCHECK_GE(write_ptr_, write_buffer->data());
-    DCHECK_EQ(write_end_ptr_, write_buffer->data() + write_page_->len());
-    DCHECK_GE(write_end_ptr_, write_ptr_);
-  }
-  DCHECK(has_read_iterator() || read_page_ == pages_.end());
-  if (read_page_ != pages_.end()) {
-    CheckPageConsistency(&*read_page_);
-    DCHECK(read_page_->is_pinned());
-    DCHECK(read_page_->retrieved_buffer);
-    // Can't check read buffer without affecting behaviour, because a read may be in
-    // flight and this would required blocking on that write.
-    DCHECK_GE(read_end_ptr_, read_ptr_);
-  }
-  if (NeedReadReservation()) {
-    DCHECK_EQ(default_page_len_, read_page_reservation_.GetReservation())
-        << DebugString();
-  } else if (!read_page_reservation_.is_closed()) {
-    DCHECK_EQ(0, read_page_reservation_.GetReservation());
-  }
-  if (NeedWriteReservation()) {
-    DCHECK_EQ(default_page_len_, write_page_reservation_.GetReservation());
-  } else if (!write_page_reservation_.is_closed()) {
-    DCHECK_EQ(0, write_page_reservation_.GetReservation());
-  }
-}
-
-void BufferedTupleStreamV2::CheckPageConsistency(const Page* page) const {
-  DCHECK_EQ(ExpectedPinCount(pinned_, page), page->pin_count()) << DebugString();
-  // Only one large row per page.
-  if (page->len() > default_page_len_) DCHECK_LE(page->num_rows, 1);
-  // We only create pages when we have a row to append to them.
-  DCHECK_GT(page->num_rows, 0);
-}
-
-string BufferedTupleStreamV2::DebugString() const {
-  stringstream ss;
-  ss << "BufferedTupleStreamV2 num_rows=" << num_rows_
-     << " rows_returned=" << rows_returned_ << " pinned=" << pinned_
-     << " delete_on_read=" << delete_on_read_ << " closed=" << closed_ << "\n"
-     << " bytes_pinned=" << bytes_pinned_ << " has_write_iterator=" << has_write_iterator_
-     << " write_page=" << write_page_ << " has_read_iterator=" << has_read_iterator_
-     << " read_page=";
-  if (read_page_ == pages_.end()) {
-    ss << "<end>";
-  } else {
-    ss << &*read_page_;
-  }
-  ss << "\n"
-     << " read_page_reservation=";
-  if (read_page_reservation_.is_closed()) {
-    ss << "<closed>";
-  } else {
-    ss << read_page_reservation_.GetReservation();
-  }
-  ss << " write_page_reservation=";
-  if (write_page_reservation_.is_closed()) {
-    ss << "<closed>";
-  } else {
-    ss << write_page_reservation_.GetReservation();
-  }
-  ss << "\n # pages=" << num_pages_ << " pages=[\n";
-  for (const Page& page : pages_) {
-    ss << "{" << page.DebugString() << "}";
-    if (&page != &pages_.back()) ss << ",\n";
-  }
-  ss << "]";
-  return ss.str();
-}
-
-string BufferedTupleStreamV2::Page::DebugString() const {
-  return Substitute("$0 num_rows=$1", handle.DebugString(), num_rows);
-}
-
-Status BufferedTupleStreamV2::Init(int node_id, bool pinned) {
-  if (!pinned) UnpinStream(UNPIN_ALL_EXCEPT_CURRENT);
-  node_id_ = node_id;
-  return Status::OK();
-}
-
-Status BufferedTupleStreamV2::PrepareForWrite(bool* got_reservation) {
-  // This must be the first iterator created.
-  DCHECK(pages_.empty());
-  DCHECK(!delete_on_read_);
-  DCHECK(!has_write_iterator());
-  DCHECK(!has_read_iterator());
-  CHECK_CONSISTENCY_FULL();
-
-  *got_reservation = buffer_pool_client_->IncreaseReservationToFit(default_page_len_);
-  if (!*got_reservation) return Status::OK();
-  has_write_iterator_ = true;
-  // Save reservation for the write iterators.
-  buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
-  CHECK_CONSISTENCY_FULL();
-  return Status::OK();
-}
-
-Status BufferedTupleStreamV2::PrepareForReadWrite(
-    bool delete_on_read, bool* got_reservation) {
-  // This must be the first iterator created.
-  DCHECK(pages_.empty());
-  DCHECK(!delete_on_read_);
-  DCHECK(!has_write_iterator());
-  DCHECK(!has_read_iterator());
-  CHECK_CONSISTENCY_FULL();
-
-  *got_reservation = buffer_pool_client_->IncreaseReservationToFit(2 * default_page_len_);
-  if (!*got_reservation) return Status::OK();
-  has_write_iterator_ = true;
-  // Save reservation for both the read and write iterators.
-  buffer_pool_client_->SaveReservation(&read_page_reservation_, default_page_len_);
-  buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
-  RETURN_IF_ERROR(PrepareForReadInternal(delete_on_read));
-  return Status::OK();
-}
-
-void BufferedTupleStreamV2::Close(RowBatch* batch, RowBatch::FlushMode flush) {
-  for (Page& page : pages_) {
-    if (batch != nullptr && page.retrieved_buffer) {
-      // Subtle: We only need to attach buffers from pages that we may have returned
-      // references to. ExtractBuffer() cannot fail for these pages because the data
-      // is guaranteed to already be in -memory.
-      BufferPool::BufferHandle buffer;
-      Status status = buffer_pool_->ExtractBuffer(buffer_pool_client_, &page.handle, &buffer);
-      DCHECK(status.ok());
-      batch->AddBuffer(buffer_pool_client_, move(buffer), flush);
-    } else {
-      buffer_pool_->DestroyPage(buffer_pool_client_, &page.handle);
-    }
-  }
-  read_page_reservation_.Close();
-  write_page_reservation_.Close();
-  pages_.clear();
-  num_pages_ = 0;
-  bytes_pinned_ = 0;
-  closed_ = true;
-}
-
-int64_t BufferedTupleStreamV2::CalcBytesPinned() const {
-  int64_t result = 0;
-  for (const Page& page : pages_) result += page.pin_count() * page.len();
-  return result;
-}
-
-Status BufferedTupleStreamV2::PinPage(Page* page) {
-  RETURN_IF_ERROR(buffer_pool_->Pin(buffer_pool_client_, &page->handle));
-  bytes_pinned_ += page->len();
-  return Status::OK();
-}
-
-int BufferedTupleStreamV2::ExpectedPinCount(bool stream_pinned, const Page* page) const {
-  return (stream_pinned || is_read_page(page) || is_write_page(page)) ? 1 : 0;
-}
-
-Status BufferedTupleStreamV2::PinPageIfNeeded(Page* page, bool stream_pinned) {
-  int new_pin_count = ExpectedPinCount(stream_pinned, page);
-  if (new_pin_count != page->pin_count()) {
-    DCHECK_EQ(new_pin_count, page->pin_count() + 1);
-    RETURN_IF_ERROR(PinPage(page));
-  }
-  return Status::OK();
-}
-
-void BufferedTupleStreamV2::UnpinPageIfNeeded(Page* page, bool stream_pinned) {
-  int new_pin_count = ExpectedPinCount(stream_pinned, page);
-  if (new_pin_count != page->pin_count()) {
-    DCHECK_EQ(new_pin_count, page->pin_count() - 1);
-    buffer_pool_->Unpin(buffer_pool_client_, &page->handle);
-    bytes_pinned_ -= page->len();
-    if (page->pin_count() == 0) page->retrieved_buffer = false;
-  }
-}
-
-bool BufferedTupleStreamV2::NeedWriteReservation() const {
-  return NeedWriteReservation(pinned_);
-}
-
-bool BufferedTupleStreamV2::NeedWriteReservation(bool stream_pinned) const {
-  return NeedWriteReservation(stream_pinned, num_pages_, has_write_iterator(),
-      write_page_ != nullptr, has_read_write_page());
-}
-
-bool BufferedTupleStreamV2::NeedWriteReservation(bool stream_pinned, int64_t num_pages,
-    bool has_write_iterator, bool has_write_page, bool has_read_write_page) {
-  if (!has_write_iterator) return false;
-  // If the stream is empty the write reservation hasn't been used yet.
-  if (num_pages == 0) return true;
-  if (stream_pinned) {
-    // Make sure we've saved the write reservation for the next page if the only
-    // page is a read/write page.
-    return has_read_write_page && num_pages == 1;
-  } else {
-    // Make sure we've saved the write reservation if it's not being used to pin
-    // a page in the stream.
-    return !has_write_page || has_read_write_page;
-  }
-}
-
-bool BufferedTupleStreamV2::NeedReadReservation() const {
-  return NeedReadReservation(pinned_);
-}
-
-bool BufferedTupleStreamV2::NeedReadReservation(bool stream_pinned) const {
-  return NeedReadReservation(
-      stream_pinned, num_pages_, has_read_iterator(), read_page_ != pages_.end());
-}
-
-bool BufferedTupleStreamV2::NeedReadReservation(bool stream_pinned, int64_t num_pages,
-    bool has_read_iterator, bool has_read_page) const {
-  return NeedReadReservation(stream_pinned, num_pages, has_read_iterator, has_read_page,
-      has_write_iterator(), write_page_ != nullptr);
-}
-
-bool BufferedTupleStreamV2::NeedReadReservation(bool stream_pinned, int64_t num_pages,
-    bool has_read_iterator, bool has_read_page, bool has_write_iterator,
-    bool has_write_page) {
-  if (!has_read_iterator) return false;
-  if (stream_pinned) {
-    // Need reservation if there are no pages currently pinned for reading but we may add
-    // a page.
-    return num_pages == 0 && has_write_iterator;
-  } else {
-    // Only need to save reservation for an unpinned stream if there is no read page
-    // and we may advance to one in the future.
-    return (has_write_iterator || num_pages > 0) && !has_read_page;
-  }
-}
-
-Status BufferedTupleStreamV2::NewWritePage(int64_t page_len) noexcept {
-  DCHECK(!closed_);
-  DCHECK(write_page_ == nullptr);
-
-  Page new_page;
-  const BufferHandle* write_buffer;
-  RETURN_IF_ERROR(buffer_pool_->CreatePage(
-      buffer_pool_client_, page_len, &new_page.handle, &write_buffer));
-  bytes_pinned_ += page_len;
-  total_byte_size_ += page_len;
-
-  pages_.push_back(std::move(new_page));
-  ++num_pages_;
-  write_page_ = &pages_.back();
-  DCHECK_EQ(write_page_->num_rows, 0);
-  write_ptr_ = write_buffer->data();
-  write_end_ptr_ = write_ptr_ + page_len;
-  return Status::OK();
-}
-
-Status BufferedTupleStreamV2::CalcPageLenForRow(int64_t row_size, int64_t* page_len) {
-  if (UNLIKELY(row_size > max_page_len_)) {
-    return Status(TErrorCode::MAX_ROW_SIZE,
-        PrettyPrinter::Print(row_size, TUnit::BYTES), node_id_,
-        PrettyPrinter::Print(0, TUnit::BYTES));
-  }
-  *page_len = max(default_page_len_, BitUtil::RoundUpToPowerOfTwo(row_size));
-  return Status::OK();
-}
-
-Status BufferedTupleStreamV2::AdvanceWritePage(
-    int64_t row_size, bool* got_reservation) noexcept {
-  DCHECK(has_write_iterator());
-  CHECK_CONSISTENCY_FAST();
-
-  int64_t page_len;
-  RETURN_IF_ERROR(CalcPageLenForRow(row_size, &page_len));
-
-  // Reservation may have been saved for the next write page, e.g. by PrepareForWrite()
-  // if the stream is empty.
-  int64_t write_reservation_to_restore = 0, read_reservation_to_restore = 0;
-  if (NeedWriteReservation(
-          pinned_, num_pages_, true, write_page_ != nullptr, has_read_write_page())
-      && !NeedWriteReservation(pinned_, num_pages_ + 1, true, true, false)) {
-    write_reservation_to_restore = default_page_len_;
-  }
-  // If the stream is pinned, we need to keep the previous write page pinned for reading.
-  // Check if we saved reservation for this case.
-  if (NeedReadReservation(pinned_, num_pages_, has_read_iterator(),
-          read_page_ != pages_.end(), true, write_page_ != nullptr)
-      && !NeedReadReservation(pinned_, num_pages_ + 1, has_read_iterator(),
-             read_page_ != pages_.end(), true, true)) {
-    read_reservation_to_restore = default_page_len_;
-  }
-
-  // We may reclaim reservation by unpinning a page that was pinned for writing.
-  int64_t write_page_reservation_to_reclaim =
-      (write_page_ != nullptr && !pinned_ && !has_read_write_page()) ?
-      write_page_->len() : 0;
-  // Check to see if we can get the reservation before changing the state of the stream.
-  if (!buffer_pool_client_->IncreaseReservationToFit(page_len
-          - write_reservation_to_restore - read_reservation_to_restore
-          - write_page_reservation_to_reclaim)) {
-    DCHECK(pinned_ || page_len > default_page_len_)
-        << "If the stream is unpinned, this should only fail for large pages";
-    CHECK_CONSISTENCY_FAST();
-    *got_reservation = false;
-    return Status::OK();
-  }
-  if (write_reservation_to_restore > 0) {
-    buffer_pool_client_->RestoreReservation(
-        &write_page_reservation_, write_reservation_to_restore);
-  }
-  if (read_reservation_to_restore > 0) {
-    buffer_pool_client_->RestoreReservation(
-        &read_page_reservation_, read_reservation_to_restore);
-  }
-  ResetWritePage();
-  RETURN_IF_ERROR(NewWritePage(page_len));
-  *got_reservation = true;
-  return Status::OK();
-}
-
-void BufferedTupleStreamV2::ResetWritePage() {
-  if (write_page_ == nullptr) return;
-  // Unpin the write page if we're reading in unpinned mode.
-  Page* prev_write_page = write_page_;
-  write_page_ = nullptr;
-  write_ptr_ = nullptr;
-  write_end_ptr_ = nullptr;
-
-  // May need to decrement pin count now that it's not the write page, depending on
-  // the stream's mode.
-  UnpinPageIfNeeded(prev_write_page, pinned_);
-}
-
-void BufferedTupleStreamV2::InvalidateWriteIterator() {
-  if (!has_write_iterator()) return;
-  ResetWritePage();
-  has_write_iterator_ = false;
-  // No more pages will be appended to stream - do not need any write reservation.
-  write_page_reservation_.Close();
-  // May not need a read reservation once the write iterator is invalidated.
-  if (NeedReadReservation(pinned_, num_pages_, has_read_iterator(),
-          read_page_ != pages_.end(), true, write_page_ != nullptr)
-      && !NeedReadReservation(pinned_, num_pages_, has_read_iterator(),
-             read_page_ != pages_.end(), false, false)) {
-    buffer_pool_client_->RestoreReservation(&read_page_reservation_, default_page_len_);
-  }
-}
-
-Status BufferedTupleStreamV2::NextReadPage() {
-  DCHECK(has_read_iterator());
-  DCHECK(!closed_);
-  CHECK_CONSISTENCY_FAST();
-
-  if (read_page_ == pages_.end()) {
-    // No rows read yet - start reading at first page. If the stream is unpinned, we can
-    // use the reservation saved in PrepareForReadWrite() to pin the first page.
-    read_page_ = pages_.begin();
-    if (NeedReadReservation(pinned_, num_pages_, true, false)
-        && !NeedReadReservation(pinned_, num_pages_, true, true)) {
-      buffer_pool_client_->RestoreReservation(&read_page_reservation_, default_page_len_);
-    }
-  } else if (delete_on_read_) {
-    DCHECK(read_page_ == pages_.begin()) << read_page_->DebugString() << " "
-                                         << DebugString();
-    DCHECK_NE(&*read_page_, write_page_);
-    bytes_pinned_ -= pages_.front().len();
-    buffer_pool_->DestroyPage(buffer_pool_client_, &pages_.front().handle);
-    pages_.pop_front();
-    --num_pages_;
-    read_page_ = pages_.begin();
-  } else {
-    // Unpin pages after reading them if needed.
-    Page* prev_read_page = &*read_page_;
-    ++read_page_;
-    UnpinPageIfNeeded(prev_read_page, pinned_);
-  }
-
-  if (read_page_ == pages_.end()) {
-    CHECK_CONSISTENCY_FULL();
-    return Status::OK();
-  }
-
-  if (!pinned_ && read_page_->len() > default_page_len_
-      && buffer_pool_client_->GetUnusedReservation() < read_page_->len()) {
-    // If we are iterating over an unpinned stream and encounter a page that is larger
-    // than the default page length, then unpinning the previous page may not have
-    // freed up enough reservation to pin the next one. The client is responsible for
-    // ensuring the reservation is available, so this indicates a bug.
-    return Status(TErrorCode::INTERNAL_ERROR, Substitute("Internal error: couldn't pin "
-          "large page of $0 bytes, client only had $1 bytes of unused reservation:\n$2",
-          read_page_->len(), buffer_pool_client_->GetUnusedReservation(),
-          buffer_pool_client_->DebugString()));
-  }
-  // Ensure the next page is pinned for reading. By this point we should have enough
-  // reservation to pin the page. If the stream is pinned, the page is already pinned.
-  // If the stream is unpinned, we freed up enough memory for a default-sized page by
-  // deleting or unpinning the previous page and ensured that, if the page was larger,
-  // that the reservation is available with the above check.
-  RETURN_IF_ERROR(PinPageIfNeeded(&*read_page_, pinned_));
-
-  // This waits for the pin to complete if the page was unpinned earlier.
-  const BufferHandle* read_buffer;
-  RETURN_IF_ERROR(read_page_->GetBuffer(&read_buffer));
-
-  read_page_rows_returned_ = 0;
-  read_ptr_ = read_buffer->data();
-  read_end_ptr_ = read_ptr_ + read_buffer->len();
-
-  // We may need to save reservation for the write page in the case when the write page
-  // became a read/write page.
-  if (!NeedWriteReservation(pinned_, num_pages_, has_write_iterator(),
-             write_page_ != nullptr, false)
-      && NeedWriteReservation(pinned_, num_pages_, has_write_iterator(),
-             write_page_ != nullptr, has_read_write_page())) {
-    buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
-  }
-  CHECK_CONSISTENCY_FAST();
-  return Status::OK();
-}
-
-void BufferedTupleStreamV2::InvalidateReadIterator() {
-  if (read_page_ != pages_.end()) {
-    // Unpin the write page if we're reading in unpinned mode.
-    Page* prev_read_page = &*read_page_;
-    read_page_ = pages_.end();
-    read_ptr_ = nullptr;
-    read_end_ptr_ = nullptr;
-
-    // May need to decrement pin count after destroying read iterator.
-    UnpinPageIfNeeded(prev_read_page, pinned_);
-  }
-  has_read_iterator_ = false;
-  if (read_page_reservation_.GetReservation() > 0) {
-    buffer_pool_client_->RestoreReservation(&read_page_reservation_, default_page_len_);
-  }
-  // It is safe to re-read a delete-on-read stream if no rows were read and no pages
-  // were therefore deleted.
-  if (rows_returned_ == 0) delete_on_read_ = false;
-}
-
-Status BufferedTupleStreamV2::PrepareForRead(bool delete_on_read, bool* got_reservation) {
-  CHECK_CONSISTENCY_FULL();
-  InvalidateWriteIterator();
-  InvalidateReadIterator();
-  // If already pinned, no additional pin is needed (see ExpectedPinCount()).
-  *got_reservation = pinned_ || pages_.empty()
-      || buffer_pool_client_->IncreaseReservationToFit(default_page_len_);
-  if (!*got_reservation) return Status::OK();
-  return PrepareForReadInternal(delete_on_read);
-}
-
-Status BufferedTupleStreamV2::PrepareForReadInternal(bool delete_on_read) {
-  DCHECK(!closed_);
-  DCHECK(!delete_on_read_);
-  DCHECK(!has_read_iterator());
-
-  has_read_iterator_ = true;
-  if (pages_.empty()) {
-    // No rows to return, or a the first read/write page has not yet been allocated.
-    read_page_ = pages_.end();
-    read_ptr_ = nullptr;
-    read_end_ptr_ = nullptr;
-  } else {
-    // Eagerly pin the first page in the stream.
-    read_page_ = pages_.begin();
-    // Check if we need to increment the pin count of the read page.
-    RETURN_IF_ERROR(PinPageIfNeeded(&*read_page_, pinned_));
-    DCHECK(read_page_->is_pinned());
-
-    // This waits for the pin to complete if the page was unpinned earlier.
-    const BufferHandle* read_buffer;
-    RETURN_IF_ERROR(read_page_->GetBuffer(&read_buffer));
-    read_ptr_ = read_buffer->data();
-    read_end_ptr_ = read_ptr_ + read_buffer->len();
-  }
-  read_page_rows_returned_ = 0;
-  rows_returned_ = 0;
-  delete_on_read_ = delete_on_read;
-  CHECK_CONSISTENCY_FULL();
-  return Status::OK();
-}
-
-Status BufferedTupleStreamV2::PinStream(bool* pinned) {
-  DCHECK(!closed_);
-  CHECK_CONSISTENCY_FULL();
-  if (pinned_) {
-    *pinned = true;
-    return Status::OK();
-  }
-  *pinned = false;
-  // First, make sure we have the reservation to pin all the pages for reading.
-  int64_t bytes_to_pin = 0;
-  for (Page& page : pages_) {
-    bytes_to_pin += (ExpectedPinCount(true, &page) - page.pin_count()) * page.len();
-  }
-
-  // Check if we have some reservation to restore.
-  bool restore_write_reservation =
-      NeedWriteReservation(false) && !NeedWriteReservation(true);
-  bool restore_read_reservation =
-      NeedReadReservation(false) && !NeedReadReservation(true);
-  int64_t increase_needed = bytes_to_pin
-      - (restore_write_reservation ? default_page_len_ : 0)
-      - (restore_read_reservation ? default_page_len_ : 0);
-  bool reservation_granted =
-      buffer_pool_client_->IncreaseReservationToFit(increase_needed);
-  if (!reservation_granted) return Status::OK();
-
-  // If there is no current write page we should have some saved reservation to use.
-  // Only continue saving it if the stream is empty and need it to pin the first page.
-  if (restore_write_reservation) {
-    buffer_pool_client_->RestoreReservation(&write_page_reservation_, default_page_len_);
-  }
-  if (restore_read_reservation) {
-    buffer_pool_client_->RestoreReservation(&read_page_reservation_, default_page_len_);
-  }
-
-  // At this point success is guaranteed - go through to pin the pages we need to pin.
-  // If the page data was evicted from memory, the read I/O can happen in parallel
-  // because we defer calling GetBuffer() until NextReadPage().
-  for (Page& page : pages_) RETURN_IF_ERROR(PinPageIfNeeded(&page, true));
-
-  pinned_ = true;
-  *pinned = true;
-  CHECK_CONSISTENCY_FULL();
-  return Status::OK();
-}
-
-void BufferedTupleStreamV2::UnpinStream(UnpinMode mode) {
-  CHECK_CONSISTENCY_FULL();
-  DCHECK(!closed_);
-  if (mode == UNPIN_ALL) {
-    // Invalidate the iterators so they don't keep pages pinned.
-    InvalidateWriteIterator();
-    InvalidateReadIterator();
-  }
-
-  if (pinned_) {
-    CHECK_CONSISTENCY_FULL();
-    // If the stream was pinned, there may be some remaining pinned pages that should
-    // be unpinned at this point.
-    for (Page& page : pages_) UnpinPageIfNeeded(&page, false);
-
-    // Check to see if we need to save some of the reservation we freed up.
-    if (!NeedWriteReservation(true) && NeedWriteReservation(false)) {
-      buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
-    }
-    if (!NeedReadReservation(true) && NeedReadReservation(false)) {
-      buffer_pool_client_->SaveReservation(&read_page_reservation_, default_page_len_);
-    }
-    pinned_ = false;
-  }
-  CHECK_CONSISTENCY_FULL();
-}
-
-Status BufferedTupleStreamV2::GetRows(
-    MemTracker* tracker, scoped_ptr<RowBatch>* batch, bool* got_rows) {
-  if (num_rows() > numeric_limits<int>::max()) {
-    // RowBatch::num_rows_ is a 32-bit int, avoid an overflow.
-    return Status(Substitute("Trying to read $0 rows into in-memory batch failed. Limit "
-                             "is $1",
-        num_rows(), numeric_limits<int>::max()));
-  }
-  RETURN_IF_ERROR(PinStream(got_rows));
-  if (!*got_rows) return Status::OK();
-  bool got_reservation;
-  RETURN_IF_ERROR(PrepareForRead(false, &got_reservation));
-  DCHECK(got_reservation) << "Stream was pinned";
-  batch->reset(new RowBatch(desc_, num_rows(), tracker));
-  bool eos = false;
-  // Loop until GetNext fills the entire batch. Each call can stop at page
-  // boundaries. We generally want it to stop, so that pages can be freed
-  // as we read. It is safe in this case because we pin the entire stream.
-  while (!eos) {
-    RETURN_IF_ERROR(GetNext(batch->get(), &eos));
-  }
-  return Status::OK();
-}
-
-Status BufferedTupleStreamV2::GetNext(RowBatch* batch, bool* eos) {
-  return GetNextInternal<false>(batch, eos, nullptr);
-}
-
-Status BufferedTupleStreamV2::GetNext(
-    RowBatch* batch, bool* eos, vector<FlatRowPtr>* flat_rows) {
-  return GetNextInternal<true>(batch, eos, flat_rows);
-}
-
-template <bool FILL_FLAT_ROWS>
-Status BufferedTupleStreamV2::GetNextInternal(
-    RowBatch* batch, bool* eos, vector<FlatRowPtr>* flat_rows) {
-  if (has_nullable_tuple_) {
-    return GetNextInternal<FILL_FLAT_ROWS, true>(batch, eos, flat_rows);
-  } else {
-    return GetNextInternal<FILL_FLAT_ROWS, false>(batch, eos, flat_rows);
-  }
-}
-
-template <bool FILL_FLAT_ROWS, bool HAS_NULLABLE_TUPLE>
-Status BufferedTupleStreamV2::GetNextInternal(
-    RowBatch* batch, bool* eos, vector<FlatRowPtr>* flat_rows) {
-  DCHECK(!closed_);
-  DCHECK(batch->row_desc()->Equals(*desc_));
-  DCHECK(is_pinned() || !FILL_FLAT_ROWS)
-      << "FlatRowPtrs are only valid for pinned streams";
-  *eos = (rows_returned_ == num_rows_);
-  if (*eos) return Status::OK();
-
-  if (UNLIKELY(read_page_ == pages_.end()
-          || read_page_rows_returned_ == read_page_->num_rows)) {
-    // Get the next page in the stream (or the first page if read_page_ was not yet
-    // initialized.) We need to do this at the beginning of the GetNext() call to ensure
-    // the buffer management semantics. NextReadPage() may unpin or delete the buffer
-    // backing the rows returned from the *previous* call to GetNext().
-    RETURN_IF_ERROR(NextReadPage());
-  }
-
-  DCHECK(has_read_iterator());
-  DCHECK(read_page_ != pages_.end());
-  DCHECK(read_page_->is_pinned()) << DebugString();
-  DCHECK_GE(read_page_rows_returned_, 0);
-
-  int rows_left_in_page = read_page_->num_rows - read_page_rows_returned_;
-  int rows_to_fill = std::min(batch->capacity() - batch->num_rows(), rows_left_in_page);
-  DCHECK_GE(rows_to_fill, 1);
-  uint8_t* tuple_row_mem = reinterpret_cast<uint8_t*>(batch->GetRow(batch->num_rows()));
-
-  // Produce tuple rows from the current page and the corresponding position on the
-  // null tuple indicator.
-  if (FILL_FLAT_ROWS) {
-    DCHECK(flat_rows != nullptr);
-    DCHECK(!delete_on_read_);
-    DCHECK_EQ(batch->num_rows(), 0);
-    flat_rows->clear();
-    flat_rows->reserve(rows_to_fill);
-  }
-
-  const uint64_t tuples_per_row = desc_->tuple_descriptors().size();
-  // Start reading from the current position in 'read_page_'.
-  for (int i = 0; i < rows_to_fill; ++i) {
-    if (FILL_FLAT_ROWS) {
-      flat_rows->push_back(read_ptr_);
-      DCHECK_EQ(flat_rows->size(), i + 1);
-    }
-    // Copy the row into the output batch.
-    TupleRow* output_row = reinterpret_cast<TupleRow*>(tuple_row_mem);
-    tuple_row_mem += sizeof(Tuple*) * tuples_per_row;
-    UnflattenTupleRow<HAS_NULLABLE_TUPLE>(&read_ptr_, output_row);
-
-    // Update string slot ptrs, skipping external strings.
-    for (int j = 0; j < inlined_string_slots_.size(); ++j) {
-      Tuple* tuple = output_row->GetTuple(inlined_string_slots_[j].first);
-      if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue;
-      FixUpStringsForRead(inlined_string_slots_[j].second, tuple);
-    }
-
-    // Update collection slot ptrs, skipping external collections. We traverse the
-    // collection structure in the same order as it was written to the stream, allowing
-    // us to infer the data layout based on the length of collections and strings.
-    for (int j = 0; j < inlined_coll_slots_.size(); ++j) {
-      Tuple* tuple = output_row->GetTuple(inlined_coll_slots_[j].first);
-      if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue;
-      FixUpCollectionsForRead(inlined_coll_slots_[j].second, tuple);
-    }
-  }
-
-  batch->CommitRows(rows_to_fill);
-  rows_returned_ += rows_to_fill;
-  read_page_rows_returned_ += rows_to_fill;
-  *eos = (rows_returned_ == num_rows_);
-  if (read_page_rows_returned_ == read_page_->num_rows && (!pinned_ || delete_on_read_)) {
-    // No more data in this page. The batch must be immediately returned up the operator
-    // tree and deep copied so that NextReadPage() can reuse the read page's buffer.
-    // TODO: IMPALA-4179 - instead attach the buffer and flush the resources.
-    batch->MarkNeedsDeepCopy();
-  }
-  if (FILL_FLAT_ROWS) DCHECK_EQ(flat_rows->size(), rows_to_fill);
-  DCHECK_LE(read_ptr_, read_end_ptr_);
-  return Status::OK();
-}
-
-void BufferedTupleStreamV2::FixUpStringsForRead(
-    const vector<SlotDescriptor*>& string_slots, Tuple* tuple) {
-  DCHECK(tuple != nullptr);
-  for (const SlotDescriptor* slot_desc : string_slots) {
-    if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
-
-    StringValue* sv = tuple->GetStringSlot(slot_desc->tuple_offset());
-    DCHECK_LE(read_ptr_ + sv->len, read_end_ptr_);
-    sv->ptr = reinterpret_cast<char*>(read_ptr_);
-    read_ptr_ += sv->len;
-  }
-}
-
-void BufferedTupleStreamV2::FixUpCollectionsForRead(
-    const vector<SlotDescriptor*>& collection_slots, Tuple* tuple) {
-  DCHECK(tuple != nullptr);
-  for (const SlotDescriptor* slot_desc : collection_slots) {
-    if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
-
-    CollectionValue* cv = tuple->GetCollectionSlot(slot_desc->tuple_offset());
-    const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor();
-    int coll_byte_size = cv->num_tuples * item_desc.byte_size();
-    DCHECK_LE(read_ptr_ + coll_byte_size, read_end_ptr_);
-    cv->ptr = reinterpret_cast<uint8_t*>(read_ptr_);
-    read_ptr_ += coll_byte_size;
-
-    if (!item_desc.HasVarlenSlots()) continue;
-    uint8_t* coll_data = cv->ptr;
-    for (int i = 0; i < cv->num_tuples; ++i) {
-      Tuple* item = reinterpret_cast<Tuple*>(coll_data);
-      FixUpStringsForRead(item_desc.string_slots(), item);
-      FixUpCollectionsForRead(item_desc.collection_slots(), item);
-      coll_data += item_desc.byte_size();
-    }
-  }
-}
-
-int64_t BufferedTupleStreamV2::ComputeRowSize(TupleRow* row) const noexcept {
-  int64_t size = 0;
-  if (has_nullable_tuple_) {
-    size += NullIndicatorBytesPerRow();
-    for (int i = 0; i < fixed_tuple_sizes_.size(); ++i) {
-      if (row->GetTuple(i) != nullptr) size += fixed_tuple_sizes_[i];
-    }
-  } else {
-    for (int i = 0; i < fixed_tuple_sizes_.size(); ++i) {
-      size += fixed_tuple_sizes_[i];
-    }
-  }
-  for (int i = 0; i < inlined_string_slots_.size(); ++i) {
-    Tuple* tuple = row->GetTuple(inlined_string_slots_[i].first);
-    if (tuple == nullptr) continue;
-    const vector<SlotDescriptor*>& slots = inlined_string_slots_[i].second;
-    for (auto it = slots.begin(); it != slots.end(); ++it) {
-      if (tuple->IsNull((*it)->null_indicator_offset())) continue;
-      size += tuple->GetStringSlot((*it)->tuple_offset())->len;
-    }
-  }
-
-  for (int i = 0; i < inlined_coll_slots_.size(); ++i) {
-    Tuple* tuple = row->GetTuple(inlined_coll_slots_[i].first);
-    if (tuple == nullptr) continue;
-    const vector<SlotDescriptor*>& slots = inlined_coll_slots_[i].second;
-    for (auto it = slots.begin(); it != slots.end(); ++it) {
-      if (tuple->IsNull((*it)->null_indicator_offset())) continue;
-      CollectionValue* cv = tuple->GetCollectionSlot((*it)->tuple_offset());
-      const TupleDescriptor& item_desc = *(*it)->collection_item_descriptor();
-      size += cv->num_tuples * item_desc.byte_size();
-
-      if (!item_desc.HasVarlenSlots()) continue;
-      for (int j = 0; j < cv->num_tuples; ++j) {
-        Tuple* item = reinterpret_cast<Tuple*>(&cv->ptr[j * item_desc.byte_size()]);
-        size += item->VarlenByteSize(item_desc);
-      }
-    }
-  }
-  return size;
-}
-
-bool BufferedTupleStreamV2::AddRowSlow(TupleRow* row, Status* status) noexcept {
-  // Use AddRowCustom*() to do the work of advancing the page.
-  int64_t row_size = ComputeRowSize(row);
-  uint8_t* data = AddRowCustomBeginSlow(row_size, status);
-  if (data == nullptr) return false;
-  bool success = DeepCopy(row, &data, data + row_size);
-  DCHECK(success);
-  DCHECK_EQ(data, write_ptr_);
-  AddRowCustomEnd(row_size);
-  return true;
-}
-
-uint8_t* BufferedTupleStreamV2::AddRowCustomBeginSlow(
-    int64_t size, Status* status) noexcept {
-  bool got_reservation;
-  *status = AdvanceWritePage(size, &got_reservation);
-  if (!status->ok() || !got_reservation) return nullptr;
-
-  // We have a large-enough page so now success is guaranteed.
-  uint8_t* result = AddRowCustomBegin(size, status);
-  DCHECK(result != nullptr);
-  return result;
-}
-
-void BufferedTupleStreamV2::AddLargeRowCustomEnd(int64_t size) noexcept {
-  DCHECK_GT(size, default_page_len_);
-  // Immediately unpin the large write page so that we're not using up extra reservation
-  // and so we don't append another row to the page.
-  ResetWritePage();
-  // Save some of the reservation we freed up so we can create the next write page when
-  // needed.
-  if (NeedWriteReservation()) {
-    buffer_pool_client_->SaveReservation(&write_page_reservation_, default_page_len_);
-  }
-  // The stream should be in a consistent state once the row is added.
-  CHECK_CONSISTENCY_FAST();
-}
-
-bool BufferedTupleStreamV2::AddRow(TupleRow* row, Status* status) noexcept {
-  DCHECK(!closed_);
-  DCHECK(has_write_iterator());
-  if (UNLIKELY(write_page_ == nullptr || !DeepCopy(row, &write_ptr_, write_end_ptr_))) {
-    return AddRowSlow(row, status);
-  }
-  ++num_rows_;
-  ++write_page_->num_rows;
-  return true;
-}
-
-bool BufferedTupleStreamV2::DeepCopy(
-    TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept {
-  return has_nullable_tuple_ ? DeepCopyInternal<true>(row, data, data_end) :
-                               DeepCopyInternal<false>(row, data, data_end);
-}
-
-// TODO: consider codegening this.
-// TODO: in case of duplicate tuples, this can redundantly serialize data.
-template <bool HAS_NULLABLE_TUPLE>
-bool BufferedTupleStreamV2::DeepCopyInternal(
-    TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept {
-  uint8_t* pos = *data;
-  const uint64_t tuples_per_row = desc_->tuple_descriptors().size();
-  // Copy the not NULL fixed len tuples. For the NULL tuples just update the NULL tuple
-  // indicator.
-  if (HAS_NULLABLE_TUPLE) {
-    int null_indicator_bytes = NullIndicatorBytesPerRow();
-    if (UNLIKELY(pos + null_indicator_bytes > data_end)) return false;
-    uint8_t* null_indicators = pos;
-    pos += NullIndicatorBytesPerRow();
-    memset(null_indicators, 0, null_indicator_bytes);
-    for (int i = 0; i < tuples_per_row; ++i) {
-      uint8_t* null_word = null_indicators + (i >> 3);
-      const uint32_t null_pos = i & 7;
-      const int tuple_size = fixed_tuple_sizes_[i];
-      Tuple* t = row->GetTuple(i);
-      const uint8_t mask = 1 << (7 - null_pos);
-      if (t != nullptr) {
-        if (UNLIKELY(pos + tuple_size > data_end)) return false;
-        memcpy(pos, t, tuple_size);
-        pos += tuple_size;
-      } else {
-        *null_word |= mask;
-      }
-    }
-  } else {
-    // If we know that there are no nullable tuples no need to set the nullability flags.
-    for (int i = 0; i < tuples_per_row; ++i) {
-      const int tuple_size = fixed_tuple_sizes_[i];
-      if (UNLIKELY(pos + tuple_size > data_end)) return false;
-      Tuple* t = row->GetTuple(i);
-      // TODO: Once IMPALA-1306 (Avoid passing empty tuples of non-materialized slots)
-      // is delivered, the check below should become DCHECK(t != nullptr).
-      DCHECK(t != nullptr || tuple_size == 0);
-      memcpy(pos, t, tuple_size);
-      pos += tuple_size;
-    }
-  }
-
-  // Copy inlined string slots. Note: we do not need to convert the string ptrs to offsets
-  // on the write path, only on the read. The tuple data is immediately followed
-  // by the string data so only the len information is necessary.
-  for (int i = 0; i < inlined_string_slots_.size(); ++i) {
-    const Tuple* tuple = row->GetTuple(inlined_string_slots_[i].first);
-    if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue;
-    if (UNLIKELY(!CopyStrings(tuple, inlined_string_slots_[i].second, &pos, data_end)))
-      return false;
-  }
-
-  // Copy inlined collection slots. We copy collection data in a well-defined order so
-  // we do not need to convert pointers to offsets on the write path.
-  for (int i = 0; i < inlined_coll_slots_.size(); ++i) {
-    const Tuple* tuple = row->GetTuple(inlined_coll_slots_[i].first);
-    if (HAS_NULLABLE_TUPLE && tuple == nullptr) continue;
-    if (UNLIKELY(!CopyCollections(tuple, inlined_coll_slots_[i].second, &pos, data_end)))
-      return false;
-  }
-  *data = pos;
-  return true;
-}
-
-bool BufferedTupleStreamV2::CopyStrings(const Tuple* tuple,
-    const vector<SlotDescriptor*>& string_slots, uint8_t** data, const uint8_t* data_end) {
-  for (const SlotDescriptor* slot_desc : string_slots) {
-    if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
-    const StringValue* sv = tuple->GetStringSlot(slot_desc->tuple_offset());
-    if (LIKELY(sv->len > 0)) {
-      if (UNLIKELY(*data + sv->len > data_end)) return false;
-
-      memcpy(*data, sv->ptr, sv->len);
-      *data += sv->len;
-    }
-  }
-  return true;
-}
-
-bool BufferedTupleStreamV2::CopyCollections(const Tuple* tuple,
-    const vector<SlotDescriptor*>& collection_slots, uint8_t** data, const uint8_t* data_end) {
-  for (const SlotDescriptor* slot_desc : collection_slots) {
-    if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
-    const CollectionValue* cv = tuple->GetCollectionSlot(slot_desc->tuple_offset());
-    const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor();
-    if (LIKELY(cv->num_tuples > 0)) {
-      int coll_byte_size = cv->num_tuples * item_desc.byte_size();
-      if (UNLIKELY(*data + coll_byte_size > data_end)) return false;
-      uint8_t* coll_data = *data;
-      memcpy(coll_data, cv->ptr, coll_byte_size);
-      *data += coll_byte_size;
-
-      if (!item_desc.HasVarlenSlots()) continue;
-      // Copy variable length data when present in collection items.
-      for (int i = 0; i < cv->num_tuples; ++i) {
-        const Tuple* item = reinterpret_cast<Tuple*>(coll_data);
-        if (UNLIKELY(!CopyStrings(item, item_desc.string_slots(), data, data_end))) {
-          return false;
-        }
-        if (UNLIKELY(
-                !CopyCollections(item, item_desc.collection_slots(), data, data_end))) {
-          return false;
-        }
-        coll_data += item_desc.byte_size();
-      }
-    }
-  }
-  return true;
-}
-
-void BufferedTupleStreamV2::GetTupleRow(FlatRowPtr flat_row, TupleRow* row) const {
-  DCHECK(row != nullptr);
-  DCHECK(!closed_);
-  DCHECK(is_pinned());
-  DCHECK(!delete_on_read_);
-  uint8_t* data = flat_row;
-  return has_nullable_tuple_ ? UnflattenTupleRow<true>(&data, row) :
-                               UnflattenTupleRow<false>(&data, row);
-}
-
-template <bool HAS_NULLABLE_TUPLE>
-void BufferedTupleStreamV2::UnflattenTupleRow(uint8_t** data, TupleRow* row) const {
-  const int tuples_per_row = desc_->tuple_descriptors().size();
-  uint8_t* ptr = *data;
-  if (has_nullable_tuple_) {
-    // Stitch together the tuples from the page and the NULL ones.
-    const uint8_t* null_indicators = ptr;
-    ptr += NullIndicatorBytesPerRow();
-    for (int i = 0; i < tuples_per_row; ++i) {
-      const uint8_t* null_word = null_indicators + (i >> 3);
-      const uint32_t null_pos = i & 7;
-      const bool is_not_null = ((*null_word & (1 << (7 - null_pos))) == 0);
-      row->SetTuple(
-          i, reinterpret_cast<Tuple*>(reinterpret_cast<uint64_t>(ptr) * is_not_null));
-      ptr += fixed_tuple_sizes_[i] * is_not_null;
-    }
-  } else {
-    for (int i = 0; i < tuples_per_row; ++i) {
-      row->SetTuple(i, reinterpret_cast<Tuple*>(ptr));
-      ptr += fixed_tuple_sizes_[i];
-    }
-  }
-  *data = ptr;
-}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/runtime/buffered-tuple-stream-v2.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream-v2.h b/be/src/runtime/buffered-tuple-stream-v2.h
deleted file mode 100644
index 2023124..0000000
--- a/be/src/runtime/buffered-tuple-stream-v2.h
+++ /dev/null
@@ -1,705 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef IMPALA_RUNTIME_BUFFERED_TUPLE_STREAM_V2_H
-#define IMPALA_RUNTIME_BUFFERED_TUPLE_STREAM_V2_H
-
-#include <set>
-#include <vector>
-#include <boost/scoped_ptr.hpp>
-#include <boost/function.hpp>
-
-#include "common/global-types.h"
-#include "common/status.h"
-#include "gutil/macros.h"
-#include "runtime/bufferpool/buffer-pool.h"
-#include "runtime/row-batch.h"
-
-namespace impala {
-
-class MemTracker;
-class RuntimeState;
-class RowDescriptor;
-class SlotDescriptor;
-class Tuple;
-class TupleRow;
-
-/// Class that provides an abstraction for a stream of tuple rows backed by BufferPool
-/// Pages. Rows can be added to the stream and read back. Rows are returned in the order
-/// they are added.
-///
-/// The BufferedTupleStream is *not* thread safe from the caller's point of view.
-/// Different threads should not concurrently call methods of the same BufferedTupleStream
-/// object.
-///
-/// Reading and writing the stream:
-/// The stream supports two modes of reading/writing, depending on whether
-/// PrepareForWrite() is called to initialize a write iterator only or
-/// PrepareForReadWrite() is called to initialize both read and write iterators to enable
-/// interleaved reads and writes.
-///
-/// To use write-only mode, PrepareForWrite() is called once and AddRow()/AddRowCustom*()
-/// are called repeatedly to initialize then advance a write iterator through the stream.
-/// Once the stream is fully written, it can be read back by calling PrepareForRead()
-/// then GetNext() repeatedly to advance a read iterator through the stream, or by
-/// calling GetRows() to get all of the rows at once.
-///
-/// To use read/write mode, PrepareForReadWrite() is called once to initialize the read
-/// and write iterators. AddRow()/AddRowCustom*() then advance a write iterator through
-/// the stream, and GetNext() advances a trailing read iterator through the stream.
-///
-/// Buffer management:
-/// The tuple stream is backed by a sequence of BufferPool Pages. The tuple stream uses
-/// the client's reservation to pin pages in memory. It will automatically try to
-/// increase the client's reservation whenever it needs to do so to make progress.
-///
-/// Normally pages are all of the same default page length, but larger pages up to the
-/// max page length are used if needed to store rows that are too large for a
-/// default-length page.
-///
-/// The stream has both pinned and unpinned modes. In the pinned mode all pages are
-/// pinned for reading. The pinned mode avoids I/O by keeping all pages pinned in memory
-/// and allows clients to save pointers to rows in the stream and randomly access them.
-/// E.g. hash tables can be backed by a BufferedTupleStream. In the unpinned mode, only
-/// pages currently being read and written are pinned and other pages are unpinned and
-/// therefore do not use the client's reservation and can be spilled to disk. The stream
-/// always holds onto a default page's worth of reservation for the read and write
-/// iterators (i.e. two page's worth if the stream is in read/write mode), even if that
-/// many pages are not currently pinned. This means that UnpinStream() always succeeds,
-/// and moving to the next default-length write page or read page on an unpinned stream
-/// does not require additional reservation. This is implemented by saving reservations
-/// in SubReservations.
-///
-/// To read or write a row larger than the default page size to/from an unpinned stream,
-/// the client must have max_page_len - default_page_len unused reservation. Writing a
-/// large row to an unpinned stream only uses the reservation for the duration of the
-/// AddRow()/AddRowCustom*() call. Reading a large row from an unpinned stream uses the
-/// reservation until the next call to GetNext(). E.g. to partition a single unpinned
-/// stream into n unpinned streams, the reservation needed is (n - 1) *
-/// default_page_len + 2 * max_page_len: one large read buffer and one large write
-/// buffer is needed to keep the row being processed in-memory, but only default-sized
-/// buffers are needed for the other streams being written.
-///
-/// The tuple stream also supports a 'delete_on_read' mode, enabled by passing a flag
-/// to PrepareForRead() which deletes the stream's pages as it does a final read
-/// pass over the stream.
-///
-/// TODO: IMPALA-4179: the buffer management can be simplified once we can attach
-/// buffers to RowBatches.
-///
-/// Page layout:
-/// Rows are stored back to back starting at the first byte of each page's buffer, with
-/// no interleaving of data from different rows. There is no padding or alignment
-/// between rows. Rows larger than the default page length are stored on their own
-/// page.
-///
-/// Tuple row layout:
-/// If the stream's tuples are nullable (i.e. has_nullable_tuple_ is true), there is a
-/// bitstring at the start of each row with null indicators for all tuples in each row
-/// (including non-nullable tuples). The bitstring occupies ceil(num_tuples_per_row / 8)
-/// bytes. A 1 indicates the tuple is null.
-///
-/// The fixed length parts of the row's tuples are stored first, followed by var len data
-/// for inlined_string_slots_ and inlined_coll_slots_. Other "external" var len slots can
-/// point to var len data outside the stream. When reading the stream, the length of each
-/// row's var len data in the stream must be computed to find the next row's start.
-///
-/// The tuple stream supports reading from the stream into RowBatches without copying
-/// out any data: the RowBatches' Tuple pointers will point directly into the stream's
-/// pages' buffers. The fixed length parts follow Impala's internal tuple format, so for
-/// the tuple to be valid, we only need to update pointers to point to the var len data
-/// in the stream. These pointers need to be updated by the stream because a spilled
-/// page's data may be relocated to a different buffer. The pointers are updated lazily
-/// upon reading the stream via GetNext() or GetRows().
-///
-/// Example layout for a row with two non-nullable tuples ((1, "hello"), (2, "world"))
-/// with all var len data stored in the stream:
-///  <---- tuple 1 -----> <------ tuple 2 ------> <- var len -> <- next row ...
-/// +--------+-----------+-----------+-----------+-------------+
-/// | IntVal | StringVal | BigIntVal | StringVal |             | ...
-/// +--------+-----------+-----------+-----------+-------------+
-/// | val: 1 | len: 5    | val: 2    | len: 5    | helloworld  | ...
-/// |        | ptr: 0x.. |           | ptr: 0x.. |             | ...
-/// +--------+-----------+-----------+-----------+-------------+
-///  <--4b--> <---12b---> <----8b---> <---12b---> <----10b---->
-///
-/// Example layout for a row with the second tuple nullable ((1, "hello"), NULL)
-/// with all var len data stored in the stream:
-/// <- null tuple bitstring -> <---- tuple 1 -----> <- var len -> <- next row ...
-/// +-------------------------+--------+-----------+------------+
-/// |                         | IntVal | StringVal |            | ...
-/// +-------------------------+--------+-----------+------------+
-/// | 0000 0010               | val: 1 | len: 5    | hello      | ...
-/// |                         |        | ptr: 0x.. |            | ...
-/// +-------------------------+--------+-----------+------------+
-///  <---------1b------------> <--4b--> <---12b---> <----5b---->
-///
-/// Example layout for a row with a single non-nullable tuple (("hello", "world")) with
-/// the second string slot stored externally to the stream:
-///  <------ tuple 1 ------> <- var len ->  <- next row ...
-/// +-----------+-----------+-------------+
-/// | StringVal | StringVal |             | ...
-/// +-----------+-----------+-------------+
-/// | len: 5    | len: 5    |  hello      | ...
-/// | ptr: 0x.. | ptr: 0x.. |             | ...
-/// +-----------+-----------+-------------+
-///  <---12b---> <---12b---> <-----5b---->
-///
-/// The behavior of reads and writes is as follows:
-/// Read:
-///   1. Unpinned: Only a single read page is pinned at a time. This means that only
-///     enough reservation to pin a single page is needed to read the stream, regardless
-///     of the stream's size. Each page is deleted or unpinned (if delete on read is true
-///     or false respectively) before advancing to the next page.
-///   2. Pinned: All pages in the stream are pinned so do not need to be pinned or
-///     unpinned when reading from the stream. If delete on read is true, pages are
-///     deleted after being read. If the stream was previously unpinned, the page's data
-///     may not yet be in memory - reading from the stream can block on I/O or fail with
-///     an I/O error.
-/// Write:
-///   1. Unpinned: Unpin pages as they fill up. This means that only enough reservation
-///     to pin a single write page is required to write to the stream, regardless of the
-///     stream's size.
-///   2. Pinned: Pages are left pinned. If the next page in the stream cannot be pinned
-///     because the client's reservation is insufficient (and could not be increased by
-///     the stream), the write call will fail and the client can either unpin the stream
-///     or free up other memory before retrying.
-///
-/// Memory lifetime of rows read from stream:
-/// If the stream is pinned and delete on read is false, it is valid to access any tuples
-/// returned via GetNext() or GetRows() until the stream is unpinned. If the stream is
-/// unpinned or delete on read is true, then the batch returned from GetNext() may have
-/// the needs_deep_copy flag set, which means that any tuple memory returned so far from
-/// the stream may be freed on the next call to GetNext().
-/// TODO: IMPALA-4179, instead of needs_deep_copy, attach the pages' buffers to the batch.
-///
-/// Manual construction of rows with AddRowCustomBegin()/AddRowCustomEnd():
-/// The BufferedTupleStream supports allocation of uninitialized rows with
-/// AddRowCustom*(). AddRowCustomBegin() is called instead of AddRow() if the client wants
-/// to manually construct a row. The caller of AddRowCustomBegin() is responsible for
-/// writing the row with exactly the layout described above then calling
-/// AddRowCustomEnd() when done.
-///
-/// If a caller constructs a tuple in this way, the caller can set the pointers and they
-/// will not be modified until the stream is read via GetNext() or GetRows().
-/// TODO: IMPALA-5007: try to remove AddRowCustom*() by unifying with AddRow().
-///
-/// TODO: we need to be able to do read ahead for pages. We need some way to indicate a
-/// page will need to be pinned soon.
-class BufferedTupleStreamV2 {
- public:
-  /// A pointer to the start of a flattened TupleRow in the stream.
-  typedef uint8_t* FlatRowPtr;
-
-  /// row_desc: description of rows stored in the stream. This is the desc for rows
-  /// that are added and the rows being returned.
-  /// default_page_len, max_page_len: default and maximum page sizes for the stream
-  /// ext_varlen_slots: set of varlen slots with data stored externally to the stream
-  BufferedTupleStreamV2(RuntimeState* state, const RowDescriptor* row_desc,
-      BufferPool::ClientHandle* buffer_pool_client, int64_t default_page_len,
-      int64_t max_page_len,
-      const std::set<SlotId>& ext_varlen_slots = std::set<SlotId>());
-
-  virtual ~BufferedTupleStreamV2();
-
-  /// Initializes the tuple stream object on behalf of node 'node_id'. Must be called
-  /// once before any of the other APIs.
-  /// If 'pinned' is true, the tuple stream starts off pinned, otherwise it is unpinned.
-  /// 'node_id' is only used for error reporting.
-  Status Init(int node_id, bool pinned) WARN_UNUSED_RESULT;
-
-  /// Prepares the stream for writing by saving enough reservation for a default-size
-  /// write page. Tries to increase reservation if there is not enough unused reservation
-  /// for a page. Called after Init() and before the first AddRow() or
-  /// AddRowCustomBegin() call.
-  /// 'got_reservation': set to true if there was enough reservation to initialize the
-  ///     first write page and false if there was not enough reservation and no other
-  ///     error was encountered. Undefined if an error status is returned.
-  Status PrepareForWrite(bool* got_reservation) WARN_UNUSED_RESULT;
-
-  /// Prepares the stream for interleaved reads and writes by saving enough reservation
-  /// for default-sized read and write pages. Called after Init() and before the first
-  /// AddRow() or AddRowCustomBegin() call.
-  /// 'delete_on_read': Pages are deleted after they are read.
-  /// 'got_reservation': set to true if there was enough reservation to initialize the
-  ///     read and write pages and false if there was not enough reservation and no other
-  ///     error was encountered. Undefined if an error status is returned.
-  Status PrepareForReadWrite(
-      bool delete_on_read, bool* got_reservation) WARN_UNUSED_RESULT;
-
-  /// Prepares the stream for reading, invalidating the write iterator (if there is one).
-  /// Therefore must be called after the last AddRow() or AddRowCustomEnd() and before
-  /// GetNext(). PrepareForRead() can be called multiple times to do multiple read passes
-  /// over the stream, unless rows were read from the stream after PrepareForRead() or
-  /// PrepareForReadWrite() was called with delete_on_read = true.
-  /// 'delete_on_read': Pages are deleted after they are read.
-  /// 'got_reservation': set to true if there was enough reservation to initialize the
-  ///     first read page and false if there was not enough reservation and no other
-  ///     error was encountered. Undefined if an error status is returned.
-  Status PrepareForRead(bool delete_on_read, bool* got_reservation) WARN_UNUSED_RESULT;
-
-  /// Adds a single row to the stream. There are four possible outcomes:
-  /// a) The append succeeds. True is returned.
-  /// b) The append fails because the unused reservation was not sufficient to add
-  ///   a new page to the stream large enough to fit 'row' and the stream could not
-  ///   increase the reservation to get enough unused reservation. Returns false and
-  ///   sets 'status' to OK. The append can be retried after freeing up memory or
-  ///   unpinning the stream.
-  /// c) The append fails with a runtime error. Returns false and sets 'status' to an
-  ///   error.
-  /// d) The append fails because the row is too large to fit in a page of a stream.
-  ///   Returns false and sets 'status' to an error.
-  ///
-  /// Unpinned streams can only encounter case b) when appending a row larger than
-  /// the default page size and the reservation could not be increased sufficiently.
-  /// Otherwise enough memory is automatically freed up by unpinning the current write
-  /// page.
-  ///
-  /// BufferedTupleStream will do a deep copy of the memory in the row. After AddRow()
-  /// returns an error, it should not be called again.
-  bool AddRow(TupleRow* row, Status* status) noexcept WARN_UNUSED_RESULT;
-
-  /// Allocates space to store a row of 'size' bytes (including fixed and variable length
-  /// data). If successful, returns a pointer to the allocated row. The caller then must
-  /// write valid data to the row and call AddRowCustomEnd().
-  ///
-  /// If unsuccessful, returns nullptr. The failure modes are the same as described in the
-  /// AddRow() comment.
-  ALWAYS_INLINE uint8_t* AddRowCustomBegin(int64_t size, Status* status);
-
-  /// Called after AddRowCustomBegin() when done writing the row. Only should be called
-  /// if AddRowCustomBegin() succeeded. See the AddRowCustomBegin() comment for
-  /// explanation.
-  /// 'size': the size passed into AddRowCustomBegin().
-  void AddRowCustomEnd(int64_t size);
-
-  /// Unflattens 'flat_row' into a regular TupleRow 'row'. Only valid to call if the
-  /// stream is pinned. The row must have been allocated with the stream's row desc.
-  /// The returned 'row' is backed by memory from the stream so is only valid as long
-  /// as the stream is pinned.
-  void GetTupleRow(FlatRowPtr flat_row, TupleRow* row) const;
-
-  /// Pins all pages in this stream and switches to pinned mode. Has no effect if the
-  /// stream is already pinned.
-  /// If the current unused reservation is not sufficient to pin the stream in memory,
-  /// this will try to increase the reservation. If that fails, 'pinned' is set to false
-  /// and the stream is left unpinned. Otherwise 'pinned' is set to true.
-  Status PinStream(bool* pinned) WARN_UNUSED_RESULT;
-
-  /// Modes for UnpinStream().
-  enum UnpinMode {
-    /// All pages in the stream are unpinned and the read/write positions in the stream
-    /// are reset. No more rows can be written to the stream after this. The stream can
-    /// be re-read from the beginning by calling PrepareForRead().
-    UNPIN_ALL,
-    /// All pages are unpinned aside from the current read and write pages (if any),
-    /// which are left in the same state. The unpinned stream can continue being read
-    /// or written from the current read or write positions.
-    UNPIN_ALL_EXCEPT_CURRENT,
-  };
-
-  /// Unpins stream with the given 'mode' as described above.
-  void UnpinStream(UnpinMode mode);
-
-  /// Get the next batch of output rows, which are backed by the stream's memory.
-  /// If the stream is unpinned or 'delete_on_read' is true, the 'needs_deep_copy'
-  /// flag may be set on 'batch' to signal that memory will be freed on the next
-  /// call to GetNext() and that the caller should copy out any data it needs from
-  /// rows in 'batch' or in previous batches returned from GetNext().
-  ///
-  /// If the stream is pinned and 'delete_on_read' is false, the memory backing the
-  /// rows will remain valid until the stream is unpinned, destroyed, etc.
-  /// TODO: IMPALA-4179: update when we simplify the memory transfer model.
-  Status GetNext(RowBatch* batch, bool* eos) WARN_UNUSED_RESULT;
-
-  /// Same as above, but populate 'flat_rows' with a pointer to the flat version of
-  /// each returned row in the pinned stream. The pointers in 'flat_rows' are only
-  /// valid as long as the stream remains pinned.
-  Status GetNext(
-      RowBatch* batch, bool* eos, std::vector<FlatRowPtr>* flat_rows) WARN_UNUSED_RESULT;
-
-  /// Returns all the rows in the stream in batch. This pins the entire stream in the
-  /// process. If the current unused reservation is not sufficient to pin the stream in
-  /// memory, this will try to increase the reservation. If that fails, 'got_rows' is set
-  /// to false.
-  Status GetRows(MemTracker* tracker, boost::scoped_ptr<RowBatch>* batch,
-      bool* got_rows) WARN_UNUSED_RESULT;
-
-  /// Must be called once at the end to cleanup all resources. If 'batch' is non-NULL,
-  /// attaches buffers from pinned pages that rows returned from GetNext() may reference.
-  /// Otherwise deletes all pages. Does nothing if the stream was already closed. The
-  /// 'flush' mode is forwarded to RowBatch::AddBuffer() when attaching buffers.
-  void Close(RowBatch* batch, RowBatch::FlushMode flush);
-
-  /// Number of rows in the stream.
-  int64_t num_rows() const { return num_rows_; }
-
-  /// Number of rows returned via GetNext().
-  int64_t rows_returned() const { return rows_returned_; }
-
-  /// Returns the byte size necessary to store the entire stream in memory.
-  int64_t byte_size() const { return total_byte_size_; }
-
-  /// Returns the number of bytes currently pinned in memory by the stream.
-  /// If ignore_current is true, the write_page_ memory is not included.
-  int64_t BytesPinned(bool ignore_current) const {
-    if (ignore_current && write_page_ != nullptr && write_page_->is_pinned()) {
-      return bytes_pinned_ - write_page_->len();
-    }
-    return bytes_pinned_;
-  }
-
-  bool is_closed() const { return closed_; }
-  bool is_pinned() const { return pinned_; }
-  bool has_read_iterator() const { return has_read_iterator_; }
-  bool has_write_iterator() const { return has_write_iterator_; }
-
-  std::string DebugString() const;
-
- private:
-  DISALLOW_COPY_AND_ASSIGN(BufferedTupleStreamV2);
-  friend class ArrayTupleStreamTest_TestArrayDeepCopy_Test;
-  friend class ArrayTupleStreamTest_TestComputeRowSize_Test;
-  friend class MultiNullableTupleStreamTest_TestComputeRowSize_Test;
-  friend class SimpleTupleStreamTest_TestGetRowsOverflow_Test;
-
-  /// Wrapper around BufferPool::PageHandle that tracks additional info about the page.
-  struct Page {
-    Page() : num_rows(0), retrieved_buffer(true) {}
-
-    inline int len() const { return handle.len(); }
-    inline bool is_pinned() const { return handle.is_pinned(); }
-    inline int pin_count() const { return handle.pin_count(); }
-    Status GetBuffer(const BufferPool::BufferHandle** buffer) {
-      RETURN_IF_ERROR(handle.GetBuffer(buffer));
-      retrieved_buffer = true;
-      return Status::OK();
-    }
-    std::string DebugString() const;
-
-    BufferPool::PageHandle handle;
-
-    /// Number of rows written to the page.
-    int num_rows;
-
-    /// Whether we called GetBuffer() on the page since it was last pinned. This means
-    /// that GetBuffer() and ExtractBuffer() cannot fail and that GetNext() may have
-    /// returned rows referencing the page's buffer.
-    bool retrieved_buffer;
-  };
-
-  /// Runtime state instance used to check for cancellation. Not owned.
-  RuntimeState* const state_;
-
-  /// Description of rows stored in the stream.
-  const RowDescriptor* desc_;
-
-  /// Plan node ID, used for error reporting.
-  int node_id_;
-
-  /// The size of the fixed length portion for each tuple in the row.
-  std::vector<int> fixed_tuple_sizes_;
-
-  /// Vectors of all the strings slots that have their varlen data stored in stream
-  /// grouped by tuple_idx.
-  std::vector<std::pair<int, std::vector<SlotDescriptor*>>> inlined_string_slots_;
-
-  /// Vectors of all the collection slots that have their varlen data stored in the
-  /// stream, grouped by tuple_idx.
-  std::vector<std::pair<int, std::vector<SlotDescriptor*>>> inlined_coll_slots_;
-
-  /// Buffer pool and client used to allocate, pin and release pages. Not owned.
-  BufferPool* buffer_pool_;
-  BufferPool::ClientHandle* buffer_pool_client_;
-
-  /// List of pages in the stream.
-  /// Empty iff one of two cases applies:
-  /// * before the first row has been added with AddRow() or AddRowCustom().
-  /// * after the stream has been destructively read in 'delete_on_read' mode
-  std::list<Page> pages_;
-  // IMPALA-5629: avoid O(n) list.size() call by explicitly tracking the number of pages.
-  // TODO: remove when we switch to GCC5+, where list.size() is O(1). See GCC bug #49561.
-  int64_t num_pages_;
-
-  /// Total size of pages_, including any pages already deleted in 'delete_on_read'
-  /// mode.
-  int64_t total_byte_size_;
-
-  /// True if there is currently an active read iterator for the stream.
-  bool has_read_iterator_;
-
-  /// The current page being read. When no read iterator is active, equal to list.end().
-  /// When a read iterator is active, either points to the current read page, or equals
-  /// list.end() if no rows have yet been read.  GetNext() does not advance this past
-  /// the end of the stream, so upon eos 'read_page_' points to the last page and
-  /// rows_returned_ == num_rows_. Always pinned, unless a Pin() call failed and an error
-  /// status was returned.
-  std::list<Page>::iterator read_page_;
-
-  /// Saved reservation for read iterator. 'default_page_len_' reservation is saved if
-  /// there is a read iterator, no pinned read page, and the possibility that the read
-  /// iterator will advance to a valid page.
-  BufferPool::SubReservation read_page_reservation_;
-
-  /// Number of rows returned from the current read_page_.
-  uint32_t read_page_rows_returned_;
-
-  /// Pointer into read_page_ to the byte after the last row read.
-  uint8_t* read_ptr_;
-
-  /// Pointer to one byte past the end of read_page_. Used to detect overruns.
-  const uint8_t* read_end_ptr_;
-
-  /// Pointer into write_page_ to the byte after the last row written.
-  uint8_t* write_ptr_;
-
-  /// Pointer to one byte past the end of write_page_. Cached to speed up computation
-  const uint8_t* write_end_ptr_;
-
-  /// Number of rows returned to the caller from GetNext() since the last
-  /// PrepareForRead() call.
-  int64_t rows_returned_;
-
-  /// True if there is currently an active write iterator into the stream.
-  bool has_write_iterator_;
-
-  /// The current page for writing. NULL if there is no write iterator or no current
-  /// write page. Always pinned. Size is 'default_page_len_', except temporarily while
-  /// appending a larger row between AddRowCustomBegin() and AddRowCustomEnd().
-  Page* write_page_;
-
-  /// Saved reservation for write iterator. 'default_page_len_' reservation is saved if
-  /// there is a write iterator, no page currently pinned for writing and the possibility
-  /// that a pin count will be needed for the write iterator in future. Specifically if:
-  /// * no rows have been appended to the stream and 'pages_' is empty, or
-  /// * the stream is unpinned, 'write_page_' is null and the last page in 'pages_'
-  ///   is a large page that we advanced past, or
-  /// * there is only one pinned page in the stream and it is already pinned for reading.
-  BufferPool::SubReservation write_page_reservation_;
-
-  /// Total bytes of pinned pages in pages_, stored to avoid iterating over the list
-  /// to compute it.
-  int64_t bytes_pinned_;
-
-  /// Number of rows stored in the stream. Includes rows that were already deleted during
-  /// a destructive 'delete_on_read' pass over the stream.
-  int64_t num_rows_;
-
-  /// The default length in bytes of pages used to store the stream's rows. All rows that
-  /// fit in a default-sized page are stored in a default-sized page.
-  const int64_t default_page_len_;
-
-  /// The maximum length in bytes of pages used to store the stream's rows. This is a
-  /// hard limit on the maximum size of row that can be stored in the stream and the
-  /// amount of reservation required to read or write to an unpinned stream.
-  const int64_t max_page_len_;
-
-  /// Whether any tuple in the rows is nullable.
-  const bool has_nullable_tuple_;
-
-  /// If true, pages are deleted after they are read during this read pass. Once rows
-  /// have been read from a stream with 'delete_on_read_' true, this is always true.
-  bool delete_on_read_;
-
-  bool closed_; // Used for debugging.
-
-  /// If true, this stream has been explicitly pinned by the caller and all pages are
-  /// kept pinned until the caller calls UnpinStream().
-  bool pinned_;
-
-  bool is_read_page(const Page* page) const {
-    return read_page_ != pages_.end() && &*read_page_ == page;
-  }
-
-  bool is_write_page(const Page* page) const { return write_page_ == page; }
-
-  /// Return true if the read and write page are the same.
-  bool has_read_write_page() const {
-    return write_page_ != nullptr && is_read_page(write_page_);
-  }
-
-  /// The slow path for AddRow() that is called if there is not sufficient space in
-  /// the current page.
-  bool AddRowSlow(TupleRow* row, Status* status) noexcept;
-
-  /// The slow path for AddRowCustomBegin() that is called if there is not sufficient space in
-  /// the current page.
-  uint8_t* AddRowCustomBeginSlow(int64_t size, Status* status) noexcept;
-
-  /// The slow path for AddRowCustomEnd() that is called for large pages.
-  void AddLargeRowCustomEnd(int64_t size) noexcept;
-
-  /// Copies 'row' into the buffer starting at *data and ending at the byte before
-  /// 'data_end'. On success, returns true and updates *data to point after the last
-  /// byte written. Returns false if there is not enough space in the buffer provided.
-  bool DeepCopy(TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept;
-
-  /// Templated implementation of DeepCopy().
-  template <bool HAS_NULLABLE_TUPLE>
-  bool DeepCopyInternal(TupleRow* row, uint8_t** data, const uint8_t* data_end) noexcept;
-
-  /// Helper function to copy strings in string_slots from tuple into *data.
-  /// Updates *data to the end of the string data added. Returns false if the data
-  /// does not fit in the buffer [*data, data_end).
-  static bool CopyStrings(const Tuple* tuple,
-      const std::vector<SlotDescriptor*>& string_slots, uint8_t** data,
-      const uint8_t* data_end);
-
-  /// Helper function to deep copy collections in collection_slots from tuple into
-  /// the buffer [*data, data_end). Updates *data to the end of the collection data
-  /// added. Returns false if the data does not fit in the buffer.
-  static bool CopyCollections(const Tuple* tuple,
-      const std::vector<SlotDescriptor*>& collection_slots, uint8_t** data,
-      const uint8_t* data_end);
-
-  /// Gets a new page of 'page_len' bytes from buffer_pool_, updating write_page_,
-  /// write_ptr_ and write_end_ptr_. The caller must ensure there is 'page_len' unused
-  /// reservation. The caller must reset the write page (if there is one) before calling.
-  Status NewWritePage(int64_t page_len) noexcept WARN_UNUSED_RESULT;
-
-  /// Determines what page size is needed to fit a row of 'row_size' bytes.
-  /// Returns an error if the row cannot fit in a page.
-  Status CalcPageLenForRow(int64_t row_size, int64_t* page_len);
-
-  /// Wrapper around NewWritePage() that allocates a new write page that fits a row of
-  /// 'row_size' bytes. Increases reservation if needed to allocate the next page.
-  /// Returns OK and sets 'got_reservation' to true if the write page was successfully
-  /// allocated. Returns an error if the row cannot fit in a page. Returns OK and sets
-  /// 'got_reservation' to false if the reservation could not be increased and no other
-  /// error was encountered.
-  Status AdvanceWritePage(
-      int64_t row_size, bool* got_reservation) noexcept WARN_UNUSED_RESULT;
-
-  /// Reset the write page, if there is one, and unpin pages accordingly. If there
-  /// is an active write iterator, the next row will be appended to a new page.
-  void ResetWritePage();
-
-  /// Invalidate the write iterator and release any resources associated with it. After
-  /// calling this, no more rows can be appended to the stream.
-  void InvalidateWriteIterator();
-
-  /// Same as PrepareForRead(), except the iterators are not invalidated and
-  /// the caller is assumed to have checked there is sufficient unused reservation.
-  Status PrepareForReadInternal(bool delete_on_read) WARN_UNUSED_RESULT;
-
-  /// Pins the next read page. This blocks reading from disk if necessary to bring the
-  /// page's data into memory. Updates read_page_, read_ptr_, and
-  /// read_page_rows_returned_.
-  Status NextReadPage() WARN_UNUSED_RESULT;
-
-  /// Invalidate the read iterator, and release any resources associated with the active
-  /// iterator.
-  void InvalidateReadIterator();
-
-  /// Returns the total additional bytes that this row will consume in write_page_ if
-  /// appended to the page. This includes the row's null indicators, the fixed length
-  /// part of the row and the data for inlined_string_slots_ and inlined_coll_slots_.
-  int64_t ComputeRowSize(TupleRow* row) const noexcept;
-
-  /// Pins page and updates tracking stats.
-  Status PinPage(Page* page) WARN_UNUSED_RESULT;
-
-  /// Increment the page's pin count if this page needs a higher pin count given the
-  /// current read and write iterator positions and whether the stream will be pinned
-  /// ('stream_pinned'). Assumes that no scenarios occur when the pin count needs to
-  /// be incremented multiple times. The caller is responsible for ensuring sufficient
-  /// reservation is available.
-  Status PinPageIfNeeded(Page* page, bool stream_pinned) WARN_UNUSED_RESULT;
-
-  /// Decrement the page's pin count if this page needs a lower pin count given the
-  /// current read and write iterator positions and whether the stream will be pinned
-  /// ('stream_pinned'). Assumes that no scenarios occur when the pin count needs to
-  /// be decremented multiple times.
-  void UnpinPageIfNeeded(Page* page, bool stream_pinned);
-
-  /// Return the expected pin count for 'page' in the current stream based on the current
-  /// read and write pages and whether the stream is pinned.
-  int ExpectedPinCount(bool stream_pinned, const Page* page) const;
-
-  /// Return true if the stream in its current state needs to have a reservation for
-  /// a write page stored in 'write_page_reservation_'.
-  bool NeedWriteReservation() const;
-
-  /// Same as above, except assume the stream's 'pinned_' state is 'stream_pinned'.
-  bool NeedWriteReservation(bool stream_pinned) const;
-
-  /// Same as above, except assume the stream has 'num_pages' pages and different
-  /// iterator state.
-  static bool NeedWriteReservation(bool stream_pinned, int64_t num_pages,
-      bool has_write_iterator, bool has_write_page, bool has_read_write_page);
-
-  /// Return true if the stream in its current state needs to have a reservation for
-  /// a read page stored in 'read_page_reservation_'.
-  bool NeedReadReservation() const;
-
-  /// Same as above, except assume the stream's 'pinned_' state is 'stream_pinned'.
-  bool NeedReadReservation(bool stream_pinned) const;
-
-  /// Same as above, except assume the stream has 'num_pages' pages and a different
-  /// read iterator state.
-  bool NeedReadReservation(bool stream_pinned, int64_t num_pages, bool has_read_iterator,
-      bool has_read_page) const;
-
-  /// Same as above, except assume the stream has 'num_pages' pages and a different
-  /// write iterator state.
-  static bool NeedReadReservation(bool stream_pinned, int64_t num_pages,
-      bool has_read_iterator, bool has_read_page, bool has_write_iterator,
-      bool has_write_page);
-
-  /// Templated GetNext implementations.
-  template <bool FILL_FLAT_ROWS>
-  Status GetNextInternal(RowBatch* batch, bool* eos, std::vector<FlatRowPtr>* flat_rows);
-  template <bool FILL_FLAT_ROWS, bool HAS_NULLABLE_TUPLE>
-  Status GetNextInternal(RowBatch* batch, bool* eos, std::vector<FlatRowPtr>* flat_rows);
-
-  /// Helper function to convert a flattened TupleRow stored starting at '*data' into
-  /// 'row'. *data is updated to point to the first byte past the end of the row.
-  template <bool HAS_NULLABLE_TUPLE>
-  void UnflattenTupleRow(uint8_t** data, TupleRow* row) const;
-
-  /// Helper function for GetNextInternal(). For each string slot in string_slots,
-  /// update StringValue's ptr field to point to the corresponding string data stored
-  /// inline in the stream (at the current value of read_ptr_) advance read_ptr_ by the
-  /// StringValue's length field.
-  void FixUpStringsForRead(const vector<SlotDescriptor*>& string_slots, Tuple* tuple);
-
-  /// Helper function for GetNextInternal(). For each collection slot in collection_slots,
-  /// recursively update any pointers in the CollectionValue to point to the corresponding
-  /// var len data stored inline in the stream, advancing read_ptr_ as data is read.
-  /// Assumes that the collection was serialized to the stream in DeepCopy()'s format.
-  void FixUpCollectionsForRead(
-      const vector<SlotDescriptor*>& collection_slots, Tuple* tuple);
-
-  /// Returns the number of null indicator bytes per row. Only valid if this stream has
-  /// nullable tuples.
-  int NullIndicatorBytesPerRow() const;
-
-  /// Returns the total bytes pinned. Only called in DCHECKs to validate bytes_pinned_.
-  int64_t CalcBytesPinned() const;
-
-  /// DCHECKs if the stream is internally inconsistent. The stream should always be in
-  /// a consistent state after returning success from a public API call. The Fast version
-  /// has constant runtime and does not check all of 'pages_'. The Full version includes
-  /// O(n) checks that require iterating over the whole 'pages_' list (e.g. checking that
-  /// each page is in a valid state).
-  void CheckConsistencyFast() const;
-  void CheckConsistencyFull() const;
-  void CheckPageConsistency(const Page* page) const;
-};
-}
-
-#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/runtime/buffered-tuple-stream-v2.inline.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream-v2.inline.h b/be/src/runtime/buffered-tuple-stream-v2.inline.h
deleted file mode 100644
index 7022249..0000000
--- a/be/src/runtime/buffered-tuple-stream-v2.inline.h
+++ /dev/null
@@ -1,56 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef IMPALA_RUNTIME_BUFFERED_TUPLE_STREAM_V2_INLINE_H
-#define IMPALA_RUNTIME_BUFFERED_TUPLE_STREAM_V2_INLINE_H
-
-#include "runtime/buffered-tuple-stream-v2.h"
-
-#include "runtime/descriptors.h"
-#include "runtime/tuple-row.h"
-#include "util/bit-util.h"
-
-namespace impala {
-
-inline int BufferedTupleStreamV2::NullIndicatorBytesPerRow() const {
-  DCHECK(has_nullable_tuple_);
-  return BitUtil::RoundUpNumBytes(fixed_tuple_sizes_.size());
-}
-
-inline uint8_t* BufferedTupleStreamV2::AddRowCustomBegin(int64_t size, Status* status) {
-  DCHECK(!closed_);
-  DCHECK(has_write_iterator());
-  if (UNLIKELY(write_page_ == nullptr || write_ptr_ + size > write_end_ptr_)) {
-    return AddRowCustomBeginSlow(size, status);
-  }
-  DCHECK(write_page_ != nullptr);
-  DCHECK(write_page_->is_pinned());
-  DCHECK_LE(write_ptr_ + size, write_end_ptr_);
-  ++num_rows_;
-  ++write_page_->num_rows;
-
-  uint8_t* data = write_ptr_;
-  write_ptr_ += size;
-  return data;
-}
-
-inline void BufferedTupleStreamV2::AddRowCustomEnd(int64_t size) {
-  if (UNLIKELY(size > default_page_len_)) AddLargeRowCustomEnd(size);
-}
-}
-
-#endif


[07/11] incubator-impala git commit: IMPALA-4674: Part 2.5: Rename BufferedTupleStreamV2

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0c46147e/be/src/runtime/buffered-tuple-stream-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream-test.cc b/be/src/runtime/buffered-tuple-stream-test.cc
new file mode 100644
index 0000000..9fe0618
--- /dev/null
+++ b/be/src/runtime/buffered-tuple-stream-test.cc
@@ -0,0 +1,1462 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <boost/bind.hpp>
+#include <boost/filesystem.hpp>
+#include <boost/scoped_ptr.hpp>
+
+#include <limits> // for std::numeric_limits<int>::max()
+#include <set>
+#include <string>
+
+#include "codegen/llvm-codegen.h"
+#include "gutil/gscoped_ptr.h"
+#include "runtime/buffered-tuple-stream.inline.h"
+#include "runtime/query-state.h"
+#include "runtime/bufferpool/reservation-tracker.h"
+#include "runtime/collection-value-builder.h"
+#include "runtime/collection-value.h"
+#include "runtime/raw-value.h"
+#include "runtime/row-batch.h"
+#include "runtime/string-value.inline.h"
+#include "runtime/test-env.h"
+#include "runtime/tmp-file-mgr.h"
+#include "service/fe-support.h"
+#include "testutil/desc-tbl-builder.h"
+#include "testutil/gtest-util.h"
+#include "util/test-info.h"
+
+#include "gen-cpp/ImpalaInternalService_types.h"
+#include "gen-cpp/Types_types.h"
+
+#include "common/names.h"
+
+using kudu::FreeDeleter;
+using std::numeric_limits;
+
+static const int BATCH_SIZE = 250;
+// Allow arbitrarily small pages in our test buffer pool.
+static const int MIN_PAGE_LEN = 1;
+// Limit the size of the buffer pool to bound memory consumption.
+static const int64_t BUFFER_POOL_LIMIT = 1024L * 1024L * 1024L;
+
+// The page length to use for the streams.
+static const int PAGE_LEN = 2 * 1024 * 1024;
+static const uint32_t PRIME = 479001599;
+
+namespace impala {
+
+static const StringValue STRINGS[] = {
+    StringValue("ABC"), StringValue("HELLO"), StringValue("123456789"),
+    StringValue("FOOBAR"), StringValue("ONE"), StringValue("THREE"),
+    StringValue("abcdefghijklmno"), StringValue("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
+    StringValue("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"),
+};
+
+static const int NUM_STRINGS = sizeof(STRINGS) / sizeof(StringValue);
+
+class SimpleTupleStreamTest : public testing::Test {
+ protected:
+  virtual void SetUp() {}
+
+  virtual void CreateDescriptors() {
+    vector<bool> nullable_tuples(1, false);
+    vector<TTupleId> tuple_ids(1, static_cast<TTupleId>(0));
+
+    DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
+    int_builder.DeclareTuple() << TYPE_INT;
+    int_desc_ =
+        pool_.Add(new RowDescriptor(*int_builder.Build(), tuple_ids, nullable_tuples));
+
+    DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
+    string_builder.DeclareTuple() << TYPE_STRING;
+    string_desc_ =
+        pool_.Add(new RowDescriptor(*string_builder.Build(), tuple_ids, nullable_tuples));
+
+    // Construct descriptors for big rows with and without nullable tuples.
+    // Each tuple contains 8 slots of TYPE_INT and a single byte for null indicator.
+    DescriptorTblBuilder big_row_builder(test_env_->exec_env()->frontend(), &pool_);
+    tuple_ids.clear();
+    nullable_tuples.clear();
+    vector<bool> non_nullable_tuples;
+    const int num_tuples = BIG_ROW_BYTES / (8 * sizeof(int) + 1);
+    for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
+      big_row_builder.DeclareTuple() << TYPE_INT << TYPE_INT << TYPE_INT << TYPE_INT
+                                     << TYPE_INT << TYPE_INT << TYPE_INT << TYPE_INT;
+      tuple_ids.push_back(static_cast<TTupleId>(tuple_idx));
+      nullable_tuples.push_back(true);
+      non_nullable_tuples.push_back(false);
+    }
+    big_row_desc_ = pool_.Add(
+        new RowDescriptor(*big_row_builder.Build(), tuple_ids, non_nullable_tuples));
+    ASSERT_FALSE(big_row_desc_->IsAnyTupleNullable());
+    nullable_big_row_desc_ = pool_.Add(
+        new RowDescriptor(*big_row_builder.Build(), tuple_ids, nullable_tuples));
+  }
+
+  virtual void TearDown() {
+    if (client_.is_registered()) {
+      test_env_->exec_env()->buffer_pool()->DeregisterClient(&client_);
+    }
+    runtime_state_ = nullptr;
+    pool_.Clear();
+    mem_pool_->FreeAll();
+    test_env_.reset();
+  }
+
+  /// Set up all of the test state: the buffer pool, a query state, a client with no
+  /// reservation and any other descriptors, etc.
+  /// The buffer pool's capacity is limited to 'buffer_pool_limit'.
+  void Init(int64_t buffer_pool_limit) {
+    test_env_.reset(new TestEnv());
+    test_env_->SetBufferPoolArgs(MIN_PAGE_LEN, buffer_pool_limit);
+    ASSERT_OK(test_env_->Init());
+
+    CreateDescriptors();
+    mem_pool_.reset(new MemPool(&tracker_));
+
+    ASSERT_OK(test_env_->CreateQueryState(0, nullptr, &runtime_state_));
+    query_state_ = runtime_state_->query_state();
+
+    RuntimeProfile* client_profile = pool_.Add(new RuntimeProfile(&pool_, "client"));
+    MemTracker* client_tracker =
+        pool_.Add(new MemTracker(-1, "client", runtime_state_->instance_mem_tracker()));
+    ASSERT_OK(test_env_->exec_env()->buffer_pool()->RegisterClient("",
+        query_state_->file_group(), runtime_state_->instance_buffer_reservation(),
+        client_tracker, numeric_limits<int>::max(), client_profile, &client_));
+  }
+
+  /// Generate the ith element of a sequence of int values.
+  int GenIntValue(int i) {
+    // Multiply by large prime to get varied bit patterns.
+    return i * PRIME;
+  }
+
+  /// Generate the ith element of a sequence of bool values.
+  bool GenBoolValue(int i) {
+    // Use a middle bit of the int value.
+    return ((GenIntValue(i) >> 8) & 0x1) != 0;
+  }
+
+  /// Count the total number of slots per row based on the given row descriptor.
+  int CountSlotsPerRow(const RowDescriptor& row_desc) {
+    int slots_per_row = 0;
+    for (int i = 0; i < row_desc.tuple_descriptors().size(); ++i) {
+      TupleDescriptor* tuple_desc = row_desc.tuple_descriptors()[i];
+      slots_per_row += tuple_desc->slots().size();
+    }
+    return slots_per_row;
+  }
+
+  /// Allocate a row batch with 'num_rows' of rows with layout described by 'row_desc'.
+  /// 'offset' is used to account for rows occupied by any previous row batches. This is
+  /// needed to match the values generated in VerifyResults(). If 'gen_null' is true,
+  /// some tuples will be set to NULL.
+  virtual RowBatch* CreateBatch(
+      const RowDescriptor* row_desc, int offset, int num_rows, bool gen_null) {
+    RowBatch* batch = pool_.Add(new RowBatch(row_desc, num_rows, &tracker_));
+    int num_tuples = row_desc->tuple_descriptors().size();
+
+    int idx = offset * CountSlotsPerRow(*row_desc);
+    for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
+      TupleRow* row = batch->GetRow(row_idx);
+      for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
+        TupleDescriptor* tuple_desc = row_desc->tuple_descriptors()[tuple_idx];
+        Tuple* tuple = Tuple::Create(tuple_desc->byte_size(), batch->tuple_data_pool());
+        bool is_null = gen_null && !GenBoolValue(idx);
+        for (int slot_idx = 0; slot_idx < tuple_desc->slots().size(); ++slot_idx, ++idx) {
+          SlotDescriptor* slot_desc = tuple_desc->slots()[slot_idx];
+          void* slot = tuple->GetSlot(slot_desc->tuple_offset());
+          switch (slot_desc->type().type) {
+            case TYPE_INT:
+              *reinterpret_cast<int*>(slot) = GenIntValue(idx);
+              break;
+            case TYPE_STRING:
+              *reinterpret_cast<StringValue*>(slot) = STRINGS[idx % NUM_STRINGS];
+              break;
+            default:
+              // The memory has been zero'ed out already by Tuple::Create().
+              break;
+          }
+        }
+        if (is_null) {
+          row->SetTuple(tuple_idx, nullptr);
+        } else {
+          row->SetTuple(tuple_idx, tuple);
+        }
+      }
+      batch->CommitLastRow();
+    }
+    return batch;
+  }
+
+  virtual RowBatch* CreateIntBatch(int offset, int num_rows, bool gen_null) {
+    return CreateBatch(int_desc_, offset, num_rows, gen_null);
+  }
+
+  virtual RowBatch* CreateStringBatch(int offset, int num_rows, bool gen_null) {
+    return CreateBatch(string_desc_, offset, num_rows, gen_null);
+  }
+
+  void AppendValue(uint8_t* ptr, vector<int>* results) {
+    if (ptr == nullptr) {
+      // For the tests indicate null-ability using the max int value
+      results->push_back(numeric_limits<int>::max());
+    } else {
+      results->push_back(*reinterpret_cast<int*>(ptr));
+    }
+  }
+
+  void AppendValue(uint8_t* ptr, vector<StringValue>* results) {
+    if (ptr == nullptr) {
+      results->push_back(StringValue());
+    } else {
+      StringValue sv = *reinterpret_cast<StringValue*>(ptr);
+      uint8_t* copy = mem_pool_->Allocate(sv.len);
+      memcpy(copy, sv.ptr, sv.len);
+      sv.ptr = reinterpret_cast<char*>(copy);
+      results->push_back(sv);
+    }
+  }
+
+  template <typename T>
+  void AppendRowTuples(TupleRow* row, RowDescriptor* row_desc, vector<T>* results) {
+    DCHECK(row != nullptr);
+    const int num_tuples = row_desc->tuple_descriptors().size();
+
+    for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
+      TupleDescriptor* tuple_desc = row_desc->tuple_descriptors()[tuple_idx];
+      Tuple* tuple = row->GetTuple(tuple_idx);
+      const int num_slots = tuple_desc->slots().size();
+      for (int slot_idx = 0; slot_idx < num_slots; ++slot_idx) {
+        SlotDescriptor* slot_desc = tuple_desc->slots()[slot_idx];
+        if (tuple == nullptr) {
+          AppendValue(nullptr, results);
+        } else {
+          void* slot = tuple->GetSlot(slot_desc->tuple_offset());
+          AppendValue(reinterpret_cast<uint8_t*>(slot), results);
+        }
+      }
+    }
+  }
+
+  template <typename T>
+  void ReadValues(BufferedTupleStream* stream, RowDescriptor* desc, vector<T>* results,
+      int num_batches = -1) {
+    bool eos = false;
+    RowBatch batch(desc, BATCH_SIZE, &tracker_);
+    int batches_read = 0;
+    do {
+      batch.Reset();
+      EXPECT_OK(stream->GetNext(&batch, &eos));
+      ++batches_read;
+      for (int i = 0; i < batch.num_rows(); ++i) {
+        AppendRowTuples(batch.GetRow(i), desc, results);
+      }
+    } while (!eos && (num_batches < 0 || batches_read <= num_batches));
+  }
+
+  void GetExpectedValue(int idx, bool is_null, int* val) {
+    if (is_null) {
+      *val = numeric_limits<int>::max();
+    } else {
+      *val = GenIntValue(idx);
+    }
+  }
+
+  void GetExpectedValue(int idx, bool is_null, StringValue* val) {
+    if (is_null) {
+      *val = StringValue();
+    } else {
+      *val = STRINGS[idx % NUM_STRINGS];
+    }
+  }
+
+  template <typename T>
+  void VerifyResults(const RowDescriptor& row_desc, const vector<T>& results,
+      int num_rows, bool gen_null) {
+    int idx = 0;
+    for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
+      const int num_tuples = row_desc.tuple_descriptors().size();
+      for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
+        const TupleDescriptor* tuple_desc = row_desc.tuple_descriptors()[tuple_idx];
+        const int num_slots = tuple_desc->slots().size();
+        bool is_null = gen_null && !GenBoolValue(idx);
+        for (int slot_idx = 0; slot_idx < num_slots; ++slot_idx, ++idx) {
+          T expected_val;
+          GetExpectedValue(idx, is_null, &expected_val);
+          ASSERT_EQ(results[idx], expected_val)
+              << "results[" << idx << "] " << results[idx] << " != " << expected_val
+              << " row_idx=" << row_idx << " tuple_idx=" << tuple_idx
+              << " slot_idx=" << slot_idx << " gen_null=" << gen_null;
+        }
+      }
+    }
+    DCHECK_EQ(results.size(), idx);
+  }
+
+  // Test adding num_batches batches of 'desc' rows to the stream and reading them back.
+  // If unpin_stream is true, operate the stream in unpinned mode.
+  // Assumes that enough buffers are available to read and write the stream.
+  template <typename T>
+  void TestValues(int num_batches, RowDescriptor* desc, bool gen_null, bool unpin_stream,
+      int64_t default_page_len = PAGE_LEN, int64_t max_page_len = -1,
+      int num_rows = BATCH_SIZE) {
+    if (max_page_len == -1) max_page_len = default_page_len;
+
+    BufferedTupleStream stream(
+        runtime_state_, desc, &client_, default_page_len, max_page_len);
+    ASSERT_OK(stream.Init(-1, true));
+    bool got_write_reservation;
+    ASSERT_OK(stream.PrepareForWrite(&got_write_reservation));
+    ASSERT_TRUE(got_write_reservation);
+
+    if (unpin_stream) {
+      stream.UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT);
+    }
+    // Add rows to the stream
+    int offset = 0;
+    for (int i = 0; i < num_batches; ++i) {
+      RowBatch* batch = nullptr;
+
+      Status status;
+      ASSERT_TRUE(sizeof(T) == sizeof(int) || sizeof(T) == sizeof(StringValue));
+      batch = CreateBatch(desc, offset, num_rows, gen_null);
+      for (int j = 0; j < batch->num_rows(); ++j) {
+        // TODO: test that AddRow succeeds after freeing memory.
+        bool b = stream.AddRow(batch->GetRow(j), &status);
+        ASSERT_OK(status);
+        ASSERT_TRUE(b);
+      }
+      offset += batch->num_rows();
+      // Reset the batch to make sure the stream handles the memory correctly.
+      batch->Reset();
+    }
+
+    bool got_read_reservation;
+    ASSERT_OK(stream.PrepareForRead(false, &got_read_reservation));
+    ASSERT_TRUE(got_read_reservation);
+
+    // Read all the rows back
+    vector<T> results;
+    ReadValues(&stream, desc, &results);
+
+    // Verify result
+    VerifyResults<T>(*desc, results, num_rows * num_batches, gen_null);
+
+    stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+  }
+
+  void TestIntValuesInterleaved(int num_batches, int num_batches_before_read,
+      bool unpin_stream, int64_t page_len = PAGE_LEN) {
+    BufferedTupleStream stream(runtime_state_, int_desc_, &client_, page_len, page_len);
+    ASSERT_OK(stream.Init(-1, true));
+    bool got_reservation;
+    ASSERT_OK(stream.PrepareForReadWrite(true, &got_reservation));
+    ASSERT_TRUE(got_reservation);
+    if (unpin_stream) {
+      stream.UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT);
+    }
+
+    vector<int> results;
+    for (int i = 0; i < num_batches; ++i) {
+      RowBatch* batch = CreateIntBatch(i * BATCH_SIZE, BATCH_SIZE, false);
+      for (int j = 0; j < batch->num_rows(); ++j) {
+        Status status;
+        bool b = stream.AddRow(batch->GetRow(j), &status);
+        ASSERT_TRUE(b);
+        ASSERT_OK(status);
+      }
+      // Reset the batch to make sure the stream handles the memory correctly.
+      batch->Reset();
+      if (i % num_batches_before_read == 0) {
+        ReadValues(&stream, int_desc_, &results, (rand() % num_batches_before_read) + 1);
+      }
+    }
+    ReadValues(&stream, int_desc_, &results);
+
+    VerifyResults<int>(*int_desc_, results, BATCH_SIZE * num_batches, false);
+
+    stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+  }
+
+  void TestUnpinPin(bool varlen_data, bool read_write);
+
+  void TestTransferMemory(bool pinned_stream, bool read_write);
+
+  // Helper that writes 'row', comprised of only string slots, to 'data'. The expected
+  // written length is presumably 'fixed_size' + 'varlen_size' (TODO confirm).
+  void WriteStringRow(const RowDescriptor* row_desc, TupleRow* row, int64_t fixed_size,
+      int64_t varlen_size, uint8_t* data);
+
+  // The temporary runtime environment used for the test.
+  scoped_ptr<TestEnv> test_env_;
+  RuntimeState* runtime_state_;
+  QueryState* query_state_;
+
+  // Buffer pool client - automatically deregistered in TearDown().
+  BufferPool::ClientHandle client_;
+
+  // Dummy MemTracker used for miscellaneous memory.
+  MemTracker tracker_;
+  ObjectPool pool_;
+  RowDescriptor* int_desc_;
+  RowDescriptor* string_desc_;
+
+  static const int64_t BIG_ROW_BYTES = 16 * 1024;
+  RowDescriptor* big_row_desc_;
+  RowDescriptor* nullable_big_row_desc_;
+  scoped_ptr<MemPool> mem_pool_;
+};
+
+// Tests with a single NULLable tuple per row.
+class SimpleNullStreamTest : public SimpleTupleStreamTest {
+ protected:
+  virtual void CreateDescriptors() {
+    vector<bool> nullable_tuples(1, true);
+    vector<TTupleId> tuple_ids(1, static_cast<TTupleId>(0));
+
+    DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
+    int_builder.DeclareTuple() << TYPE_INT;
+    int_desc_ =
+        pool_.Add(new RowDescriptor(*int_builder.Build(), tuple_ids, nullable_tuples));
+
+    DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
+    string_builder.DeclareTuple() << TYPE_STRING;
+    string_desc_ =
+        pool_.Add(new RowDescriptor(*string_builder.Build(), tuple_ids, nullable_tuples));
+  }
+}; // SimpleNullStreamTest
+
+// Tests with multiple non-NULLable tuples per row.
+class MultiTupleStreamTest : public SimpleTupleStreamTest {
+ protected:
+  virtual void CreateDescriptors() {
+    vector<bool> nullable_tuples;
+    nullable_tuples.push_back(false);
+    nullable_tuples.push_back(false);
+    nullable_tuples.push_back(false);
+
+    vector<TTupleId> tuple_ids;
+    tuple_ids.push_back(static_cast<TTupleId>(0));
+    tuple_ids.push_back(static_cast<TTupleId>(1));
+    tuple_ids.push_back(static_cast<TTupleId>(2));
+
+    DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
+    int_builder.DeclareTuple() << TYPE_INT;
+    int_builder.DeclareTuple() << TYPE_INT;
+    int_builder.DeclareTuple() << TYPE_INT;
+    int_desc_ =
+        pool_.Add(new RowDescriptor(*int_builder.Build(), tuple_ids, nullable_tuples));
+
+    DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
+    string_builder.DeclareTuple() << TYPE_STRING;
+    string_builder.DeclareTuple() << TYPE_STRING;
+    string_builder.DeclareTuple() << TYPE_STRING;
+    string_desc_ =
+        pool_.Add(new RowDescriptor(*string_builder.Build(), tuple_ids, nullable_tuples));
+  }
+};
+
+// Tests with multiple NULLable tuples per row.
+class MultiNullableTupleStreamTest : public SimpleTupleStreamTest {
+ protected:
+  virtual void CreateDescriptors() {
+    vector<bool> nullable_tuples;
+    nullable_tuples.push_back(false);
+    nullable_tuples.push_back(true);
+    nullable_tuples.push_back(true);
+
+    vector<TTupleId> tuple_ids;
+    tuple_ids.push_back(static_cast<TTupleId>(0));
+    tuple_ids.push_back(static_cast<TTupleId>(1));
+    tuple_ids.push_back(static_cast<TTupleId>(2));
+
+    DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
+    int_builder.DeclareTuple() << TYPE_INT;
+    int_builder.DeclareTuple() << TYPE_INT;
+    int_builder.DeclareTuple() << TYPE_INT;
+    int_desc_ =
+        pool_.Add(new RowDescriptor(*int_builder.Build(), tuple_ids, nullable_tuples));
+
+    DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
+    string_builder.DeclareTuple() << TYPE_STRING;
+    string_builder.DeclareTuple() << TYPE_STRING;
+    string_builder.DeclareTuple() << TYPE_STRING;
+    string_desc_ =
+        pool_.Add(new RowDescriptor(*string_builder.Build(), tuple_ids, nullable_tuples));
+  }
+};
+
+/// Tests with collection types.
+class ArrayTupleStreamTest : public SimpleTupleStreamTest {
+ protected:
+  RowDescriptor* array_desc_;
+
+  virtual void CreateDescriptors() {
+    // tuples: (array<string>, array<array<int>>) (array<int>)
+    vector<bool> nullable_tuples(2, true);
+    vector<TTupleId> tuple_ids;
+    tuple_ids.push_back(static_cast<TTupleId>(0));
+    tuple_ids.push_back(static_cast<TTupleId>(1));
+    ColumnType string_array_type;
+    string_array_type.type = TYPE_ARRAY;
+    string_array_type.children.push_back(TYPE_STRING);
+
+    ColumnType int_array_type;
+    int_array_type.type = TYPE_ARRAY;
+    int_array_type.children.push_back(TYPE_STRING);
+
+    ColumnType nested_array_type;
+    nested_array_type.type = TYPE_ARRAY;
+    nested_array_type.children.push_back(int_array_type);
+
+    DescriptorTblBuilder builder(test_env_->exec_env()->frontend(), &pool_);
+    builder.DeclareTuple() << string_array_type << nested_array_type;
+    builder.DeclareTuple() << int_array_type;
+    array_desc_ =
+        pool_.Add(new RowDescriptor(*builder.Build(), tuple_ids, nullable_tuples));
+  }
+};
+
+// Basic API test. No data should be going to disk.
+TEST_F(SimpleTupleStreamTest, Basic) {
+  Init(numeric_limits<int64_t>::max());
+  TestValues<int>(0, int_desc_, false, true);
+  TestValues<int>(1, int_desc_, false, true);
+  TestValues<int>(10, int_desc_, false, true);
+  TestValues<int>(100, int_desc_, false, true);
+  TestValues<int>(0, int_desc_, false, false);
+  TestValues<int>(1, int_desc_, false, false);
+  TestValues<int>(10, int_desc_, false, false);
+  TestValues<int>(100, int_desc_, false, false);
+
+  TestValues<StringValue>(0, string_desc_, false, true);
+  TestValues<StringValue>(1, string_desc_, false, true);
+  TestValues<StringValue>(10, string_desc_, false, true);
+  TestValues<StringValue>(100, string_desc_, false, true);
+  TestValues<StringValue>(0, string_desc_, false, false);
+  TestValues<StringValue>(1, string_desc_, false, false);
+  TestValues<StringValue>(10, string_desc_, false, false);
+  TestValues<StringValue>(100, string_desc_, false, false);
+
+  TestIntValuesInterleaved(0, 1, true);
+  TestIntValuesInterleaved(1, 1, true);
+  TestIntValuesInterleaved(10, 5, true);
+  TestIntValuesInterleaved(100, 15, true);
+  TestIntValuesInterleaved(0, 1, false);
+  TestIntValuesInterleaved(1, 1, false);
+  TestIntValuesInterleaved(10, 5, false);
+  TestIntValuesInterleaved(100, 15, false);
+}
+
+// Test with only 1 buffer.
+TEST_F(SimpleTupleStreamTest, OneBufferSpill) {
+  // Each buffer can only hold 128 ints, so this spills quite often.
+  int buffer_size = 128 * sizeof(int);
+  Init(buffer_size);
+  TestValues<int>(0, int_desc_, false, true, buffer_size);
+  TestValues<int>(1, int_desc_, false, true, buffer_size);
+  TestValues<int>(10, int_desc_, false, true, buffer_size);
+
+  TestValues<StringValue>(0, string_desc_, false, true, buffer_size);
+  TestValues<StringValue>(1, string_desc_, false, true, buffer_size);
+  TestValues<StringValue>(10, string_desc_, false, true, buffer_size);
+}
+
+// Test with a few buffers.
+TEST_F(SimpleTupleStreamTest, ManyBufferSpill) {
+  int buffer_size = 128 * sizeof(int);
+  Init(10 * buffer_size);
+
+  TestValues<int>(0, int_desc_, false, true, buffer_size);
+  TestValues<int>(1, int_desc_, false, true, buffer_size);
+  TestValues<int>(10, int_desc_, false, true, buffer_size);
+  TestValues<int>(100, int_desc_, false, true, buffer_size);
+  TestValues<StringValue>(0, string_desc_, false, true, buffer_size);
+  TestValues<StringValue>(1, string_desc_, false, true, buffer_size);
+  TestValues<StringValue>(10, string_desc_, false, true, buffer_size);
+  TestValues<StringValue>(100, string_desc_, false, true, buffer_size);
+
+  TestIntValuesInterleaved(0, 1, true, buffer_size);
+  TestIntValuesInterleaved(1, 1, true, buffer_size);
+  TestIntValuesInterleaved(10, 5, true, buffer_size);
+  TestIntValuesInterleaved(100, 15, true, buffer_size);
+}
+
+/// Fills a stream until AddRow() reports it is full, unpinning and re-pinning it every
+/// tenth batch while writing, then reads everything back three times (the last pass
+/// with delete_on_read) and checks the pinned byte counts before and after Close().
+/// 'varlen_data': write string rows (string_desc_) rather than int rows (int_desc_).
+/// 'read_write': prepare with PrepareForReadWrite() instead of PrepareForWrite(); the
+/// first read pass then skips PrepareForRead().
+void SimpleTupleStreamTest::TestUnpinPin(bool varlen_data, bool read_write) {
+  int buffer_size = 128 * sizeof(int);
+  int num_buffers = 10;
+  Init(num_buffers * buffer_size);
+  RowDescriptor* row_desc = varlen_data ? string_desc_ : int_desc_;
+
+  BufferedTupleStream stream(
+      runtime_state_, row_desc, &client_, buffer_size, buffer_size);
+  ASSERT_OK(stream.Init(-1, true));
+  if (read_write) {
+    bool got_reservation = false;
+    ASSERT_OK(stream.PrepareForReadWrite(false, &got_reservation));
+    ASSERT_TRUE(got_reservation);
+  } else {
+    bool got_write_reservation = false;
+    ASSERT_OK(stream.PrepareForWrite(&got_write_reservation));
+    ASSERT_TRUE(got_write_reservation);
+  }
+
+  int offset = 0;
+  bool full = false;
+  int num_batches = 0;
+  while (!full) {
+    // Make sure we can switch between pinned and unpinned states while writing.
+    if (num_batches % 10 == 0) {
+      bool pinned = false;
+      stream.UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT);
+      ASSERT_OK(stream.PinStream(&pinned));
+      // Use a gtest assertion rather than DCHECK so the check still runs (and fails
+      // the test) in builds where DCHECKs are compiled out.
+      ASSERT_TRUE(pinned);
+    }
+
+    RowBatch* batch = varlen_data ? CreateStringBatch(offset, BATCH_SIZE, false) :
+                                    CreateIntBatch(offset, BATCH_SIZE, false);
+    // 'j' is declared outside the loop: it records how many rows of this batch were
+    // actually added before the stream filled up.
+    int j = 0;
+    for (; j < batch->num_rows(); ++j) {
+      Status status;
+      full = !stream.AddRow(batch->GetRow(j), &status);
+      ASSERT_OK(status);
+      if (full) break;
+    }
+    offset += j;
+    ++num_batches;
+  }
+
+  stream.UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT);
+
+  bool pinned = false;
+  ASSERT_OK(stream.PinStream(&pinned));
+  ASSERT_TRUE(pinned);
+
+  // Read and verify result a few times. We should be able to reread the stream if
+  // we don't use delete on read mode.
+  int read_iters = 3;
+  for (int i = 0; i < read_iters; ++i) {
+    // Only the final pass consumes the stream.
+    bool delete_on_read = i == read_iters - 1;
+    if (i > 0 || !read_write) {
+      bool got_read_reservation = false;
+      ASSERT_OK(stream.PrepareForRead(delete_on_read, &got_read_reservation));
+      ASSERT_TRUE(got_read_reservation);
+    }
+
+    if (varlen_data) {
+      vector<StringValue> results;
+      ReadValues(&stream, row_desc, &results);
+      VerifyResults<StringValue>(*string_desc_, results, offset, false);
+    } else {
+      vector<int> results;
+      ReadValues(&stream, row_desc, &results);
+      VerifyResults<int>(*int_desc_, results, offset, false);
+    }
+  }
+
+  // After delete_on_read, all blocks aside from the last should be deleted.
+  // Note: this should really be 0, but the BufferedTupleStream returns eos before
+  // deleting the last block, rather than after, so the last block isn't deleted
+  // until the stream is closed.
+  ASSERT_EQ(stream.BytesPinned(false), buffer_size);
+
+  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+
+  ASSERT_EQ(stream.BytesPinned(false), 0);
+}
+
+// Unpin/pin round trip with int rows on a stream prepared with PrepareForWrite().
+TEST_F(SimpleTupleStreamTest, UnpinPin) {
+  const bool varlen_data = false;
+  const bool read_write = false;
+  TestUnpinPin(varlen_data, read_write);
+}
+
+// Unpin/pin round trip with int rows on a stream prepared with PrepareForReadWrite().
+TEST_F(SimpleTupleStreamTest, UnpinPinReadWrite) {
+  const bool varlen_data = false;
+  const bool read_write = true;
+  TestUnpinPin(varlen_data, read_write);
+}
+
+// Unpin/pin round trip with variable-length (string) rows. The first argument must be
+// true: passing false would repeat the UnpinPin test and leave the string path of
+// TestUnpinPin() unexercised.
+TEST_F(SimpleTupleStreamTest, UnpinPinVarlen) {
+  TestUnpinPin(/* varlen_data */ true, /* read_write */ false);
+}
+
+/// Builds a stream of int rows holding at least four buffers' worth of bytes,
+/// optionally reads one batch back, then closes the stream with FLUSH_RESOURCES and
+/// checks how many buffers were attached to the row batch.
+/// 'pin_stream': forwarded as the second argument of stream.Init().
+/// 'read_write': use PrepareForReadWrite() and read one batch before closing.
+void SimpleTupleStreamTest::TestTransferMemory(bool pin_stream, bool read_write) {
+  // Buffers are kept small so that the explicit FLUSH_RESOURCES flag is what makes
+  // the batch reach capacity.
+  const int buffer_size = 4 * 1024;
+  Init(100 * buffer_size);
+
+  BufferedTupleStream stream(
+      runtime_state_, int_desc_, &client_, buffer_size, buffer_size);
+  ASSERT_OK(stream.Init(-1, pin_stream));
+  bool got_reservation;
+  if (!read_write) {
+    ASSERT_OK(stream.PrepareForWrite(&got_reservation));
+  } else {
+    ASSERT_OK(stream.PrepareForReadWrite(true, &got_reservation));
+  }
+  ASSERT_TRUE(got_reservation);
+  RowBatch* batch = CreateIntBatch(0, 1024, false);
+
+  // Keep appending the whole batch until the stream spans 4 pages.
+  const int total_num_pages = 4;
+  while (stream.byte_size() < total_num_pages * buffer_size) {
+    Status status;
+    const int rows_in_batch = batch->num_rows();
+    for (int row_idx = 0; row_idx < rows_in_batch; ++row_idx) {
+      const bool added = stream.AddRow(batch->GetRow(row_idx), &status);
+      EXPECT_TRUE(added);
+      ASSERT_OK(status);
+    }
+  }
+
+  batch->Reset();
+
+  if (read_write) {
+    // Pull one batch back out so that a read buffer is in memory.
+    bool eos;
+    ASSERT_OK(stream.GetNext(batch, &eos));
+    EXPECT_FALSE(eos);
+  }
+  stream.Close(batch, RowBatch::FlushMode::FLUSH_RESOURCES);
+  if (!pin_stream) {
+    if (read_write) {
+      // Both the read buffer and the write buffer should be attached.
+      EXPECT_EQ(2, batch->num_buffers());
+    } else {
+      // Exactly one buffer should be attached.
+      EXPECT_EQ(1, batch->num_buffers());
+    }
+  } else {
+    EXPECT_EQ(total_num_pages, batch->num_buffers());
+  }
+  // Closing with FLUSH_RESOURCES should have marked the batch as at capacity.
+  EXPECT_TRUE(batch->AtCapacity());
+  batch->Reset();
+  EXPECT_EQ(0, batch->num_buffers());
+}
+
+/// Attach memory to a row batch from a pinned stream opened in read/write mode.
+TEST_F(SimpleTupleStreamTest, TransferMemoryFromPinnedStreamReadWrite) {
+  const bool pin_stream = true;
+  const bool read_write = true;
+  TestTransferMemory(pin_stream, read_write);
+}
+
+/// Attach memory to a row batch from a pinned stream that was only written to.
+TEST_F(SimpleTupleStreamTest, TransferMemoryFromPinnedStreamNoReadWrite) {
+  const bool pin_stream = true;
+  const bool read_write = false;
+  TestTransferMemory(pin_stream, read_write);
+}
+
+/// Attach memory to a row batch from an unpinned stream opened in read/write mode.
+TEST_F(SimpleTupleStreamTest, TransferMemoryFromUnpinnedStreamReadWrite) {
+  const bool pin_stream = false;
+  const bool read_write = true;
+  TestTransferMemory(pin_stream, read_write);
+}
+
+/// Attach memory to a row batch from an unpinned stream that was only written to.
+TEST_F(SimpleTupleStreamTest, TransferMemoryFromUnpinnedStreamNoReadWrite) {
+  const bool pin_stream = false;
+  const bool read_write = false;
+  TestTransferMemory(pin_stream, read_write);
+}
+
+// Test that the tuple stream still works when its string slots reference data stored
+// outside the stream. The aggregation node relies on this since it updates tuples
+// in-place.
+TEST_F(SimpleTupleStreamTest, StringsOutsideStream) {
+  int buffer_size = 8 * 1024 * 1024;
+  Init(2 * buffer_size);
+  Status status = Status::OK();
+
+  int num_batches = 100;
+  int rows_added = 0;
+  // gtest assertions are used instead of DCHECKs so that these checks are still
+  // performed in builds where DCHECKs are compiled out.
+  ASSERT_EQ(1u, string_desc_->tuple_descriptors().size());
+  TupleDescriptor& tuple_desc = *string_desc_->tuple_descriptors()[0];
+  // The fixed-length size of the tuple is the same for every row, so compute it once.
+  const int fixed_size = tuple_desc.byte_size();
+
+  // Register every string slot of the tuple as an external slot of the stream.
+  set<SlotId> external_slots;
+  for (const auto& slot : tuple_desc.string_slots()) {
+    external_slots.insert(slot->id());
+  }
+
+  BufferedTupleStream stream(
+      runtime_state_, string_desc_, &client_, buffer_size, buffer_size, external_slots);
+  ASSERT_OK(stream.Init(0, false));
+  bool got_reservation = false;
+  ASSERT_OK(stream.PrepareForWrite(&got_reservation));
+  ASSERT_TRUE(got_reservation);
+
+  for (int i = 0; i < num_batches; ++i) {
+    RowBatch* batch = CreateStringBatch(rows_added, BATCH_SIZE, false);
+    for (int j = 0; j < batch->num_rows(); ++j) {
+      // Copy fixed portion in, but leave it pointing to row batch's varlen data.
+      uint8_t* tuple_data = stream.AddRowCustomBegin(fixed_size, &status);
+      ASSERT_TRUE(tuple_data != nullptr);
+      ASSERT_TRUE(status.ok());
+      memcpy(tuple_data, batch->GetRow(j)->GetTuple(0), fixed_size);
+      stream.AddRowCustomEnd(fixed_size);
+    }
+    rows_added += batch->num_rows();
+  }
+
+  ASSERT_EQ(rows_added, stream.num_rows());
+
+  // First pass keeps the stream contents, second pass reads with delete_on_read set.
+  for (int delete_on_read = 0; delete_on_read <= 1; ++delete_on_read) {
+    // Keep stream in memory and test we can read ok.
+    vector<StringValue> results;
+    bool got_read_reservation = false;
+    ASSERT_OK(stream.PrepareForRead(delete_on_read, &got_read_reservation));
+    ASSERT_TRUE(got_read_reservation);
+    ReadValues(&stream, string_desc_, &results);
+    VerifyResults<StringValue>(*string_desc_, results, rows_added, false);
+  }
+
+  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+}
+
+// Construct a big row by stitching together many tuples so the total row size
+// will be close to the IO block size. Without null indicators the rows fit in the
+// stream; with nullable tuples the extra space needed for null indicators makes
+// adding a fully populated row fail with MAX_ROW_SIZE.
+TEST_F(SimpleTupleStreamTest, BigRow) {
+  const int64_t MAX_BUFFERS = 10;
+  Init(MAX_BUFFERS * BIG_ROW_BYTES);
+
+  // Test writing this row into the stream and then reading it back.
+  // Make sure to exercise the case where the row is larger than the default page.
+  // If the stream is pinned, we can only fit MAX_BUFFERS - 1 rows (since we always
+  // advance to the next page). In the unpinned case we should be able to write
+  // arbitrarily many rows.
+  TestValues<int>(1, big_row_desc_, false, false, BIG_ROW_BYTES, BIG_ROW_BYTES, 1);
+  TestValues<int>(
+      MAX_BUFFERS - 1, big_row_desc_, false, false, BIG_ROW_BYTES, BIG_ROW_BYTES, 1);
+  TestValues<int>(1, big_row_desc_, false, false, BIG_ROW_BYTES / 4, BIG_ROW_BYTES, 1);
+  TestValues<int>(
+      MAX_BUFFERS - 1, big_row_desc_, false, false, BIG_ROW_BYTES / 4, BIG_ROW_BYTES, 1);
+  TestValues<int>(1, big_row_desc_, false, true, BIG_ROW_BYTES, BIG_ROW_BYTES, 1);
+  TestValues<int>(
+      MAX_BUFFERS - 1, big_row_desc_, false, true, BIG_ROW_BYTES, BIG_ROW_BYTES, 1);
+  TestValues<int>(
+      5 * MAX_BUFFERS, big_row_desc_, false, true, BIG_ROW_BYTES, BIG_ROW_BYTES, 1);
+  TestValues<int>(1, big_row_desc_, false, true, BIG_ROW_BYTES / 4, BIG_ROW_BYTES, 1);
+  TestValues<int>(
+      MAX_BUFFERS - 1, big_row_desc_, false, true, BIG_ROW_BYTES / 4, BIG_ROW_BYTES, 1);
+  TestValues<int>(
+      5 * MAX_BUFFERS, big_row_desc_, false, true, BIG_ROW_BYTES / 4, BIG_ROW_BYTES, 1);
+
+  // Test the case where it fits in an in-between page size.
+  TestValues<int>(MAX_BUFFERS - 1, big_row_desc_, false, false, BIG_ROW_BYTES / 4,
+      BIG_ROW_BYTES * 2, 1);
+  TestValues<int>(MAX_BUFFERS - 1, big_row_desc_, false, true, BIG_ROW_BYTES / 4,
+      BIG_ROW_BYTES * 2, 1);
+
+  // Construct a big row with nullable tuples. This requires extra space for null
+  // indicators in the stream so adding the row will fail.
+  ASSERT_TRUE(nullable_big_row_desc_->IsAnyTupleNullable());
+  BufferedTupleStream nullable_stream(
+      runtime_state_, nullable_big_row_desc_, &client_, BIG_ROW_BYTES, BIG_ROW_BYTES);
+  ASSERT_OK(nullable_stream.Init(-1, true));
+  bool got_reservation = false;
+  ASSERT_OK(nullable_stream.PrepareForWrite(&got_reservation));
+  // As in the other tests, writing only makes sense once the reservation was granted.
+  ASSERT_TRUE(got_reservation);
+
+  // With null tuples, a row can fit in the stream.
+  RowBatch* batch = CreateBatch(nullable_big_row_desc_, 0, 1, true);
+  Status status;
+  EXPECT_TRUE(nullable_stream.AddRow(batch->GetRow(0), &status));
+  ASSERT_OK(status);
+  // With the additional null indicator, we can't fit all the tuples of a row into
+  // the stream.
+  batch = CreateBatch(nullable_big_row_desc_, 0, 1, false);
+  EXPECT_FALSE(nullable_stream.AddRow(batch->GetRow(0), &status));
+  EXPECT_EQ(TErrorCode::MAX_ROW_SIZE, status.code());
+  nullable_stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+}
+
+// Check the reservation used by the stream when rows are as large as BIG_ROW_BYTES.
+TEST_F(SimpleTupleStreamTest, BigRowMemoryUse) {
+  const int64_t MAX_BUFFERS = 10;
+  const int64_t DEFAULT_PAGE_LEN = BIG_ROW_BYTES / 4;
+  Init(MAX_BUFFERS * BIG_ROW_BYTES);
+  BufferedTupleStream stream(
+      runtime_state_, big_row_desc_, &client_, DEFAULT_PAGE_LEN, BIG_ROW_BYTES * 2);
+  ASSERT_OK(stream.Init(-1, true));
+  bool got_reservation;
+  ASSERT_OK(stream.PrepareForWrite(&got_reservation));
+  ASSERT_TRUE(got_reservation);
+
+  Status status;
+  // Appending MAX_BUFFERS rows should work without any problem.
+  for (int row_idx = 0; row_idx < MAX_BUFFERS; ++row_idx) {
+    RowBatch* row_batch = CreateBatch(big_row_desc_, row_idx, 1, false);
+    ASSERT_TRUE(stream.AddRow(row_batch->GetRow(0), &status));
+    // One large page is expected per row written so far.
+    EXPECT_EQ(BIG_ROW_BYTES * (row_idx + 1), client_.GetUsedReservation())
+        << row_idx << ": " << client_.DebugString();
+  }
+
+  // One more row does not fit in memory: AddRow() fails without an error status until
+  // the stream is unpinned.
+  RowBatch* extra_batch = CreateBatch(big_row_desc_, MAX_BUFFERS, 1, false);
+  ASSERT_FALSE(stream.AddRow(extra_batch->GetRow(0), &status));
+  ASSERT_OK(status);
+  stream.UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT);
+  ASSERT_TRUE(stream.AddRow(extra_batch->GetRow(0), &status));
+
+  // Read every row back and verify the values.
+  ASSERT_OK(stream.PrepareForRead(false, &got_reservation));
+  ASSERT_TRUE(got_reservation);
+  vector<int> values_read;
+  ReadValues(&stream, big_row_desc_, &values_read);
+  VerifyResults<int>(*big_row_desc_, values_read, MAX_BUFFERS + 1, false);
+  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+}
+
+// Test for IMPALA-3923: overflow of 32-bit int in GetRows().
+TEST_F(SimpleTupleStreamTest, TestGetRowsOverflow) {
+  Init(BUFFER_POOL_LIMIT);
+  BufferedTupleStream stream(runtime_state_, int_desc_, &client_, PAGE_LEN, PAGE_LEN);
+  ASSERT_OK(stream.Init(-1, true));
+
+  // Add more rows than can be fit in a RowBatch (limited by its 32-bit row count).
+  // Actually adding the rows would take a very long time, so just set num_rows_.
+  // This puts the stream in an inconsistent state, but exercises the right code path.
+  // The constant is built from an int64_t because shifting the literal '1L' by 33
+  // bits overflows on platforms where 'long' is only 32 bits wide.
+  stream.num_rows_ = static_cast<int64_t>(1) << 33;
+  bool got_rows = false;
+  scoped_ptr<RowBatch> overflow_batch;
+  // GetRows() is expected to return an error status rather than succeed.
+  ASSERT_FALSE(stream.GetRows(&tracker_, &overflow_batch, &got_rows).ok());
+  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+}
+
+// Test rows greater than the default page size. Also exercise the read/write
+// mode with large pages.
+// Each row holds a single string sized so that tuple plus string data add up to
+// BIG_ROW_BYTES; row i is filled with the byte value i so reads can be verified.
+TEST_F(SimpleTupleStreamTest, BigStringReadWrite) {
+  const int64_t MAX_BUFFERS = 10;
+  const int64_t DEFAULT_PAGE_LEN = BIG_ROW_BYTES / 4;
+  Init(MAX_BUFFERS * BIG_ROW_BYTES);
+  Status status;
+  BufferedTupleStream stream(
+      runtime_state_, string_desc_, &client_, DEFAULT_PAGE_LEN, BIG_ROW_BYTES * 2);
+  ASSERT_OK(stream.Init(-1, true));
+  RowBatch write_batch(string_desc_, 1024, &tracker_);
+  RowBatch read_batch(string_desc_, 1024, &tracker_);
+  bool got_reservation;
+  ASSERT_OK(stream.PrepareForReadWrite(false, &got_reservation));
+  ASSERT_TRUE(got_reservation);
+  // Build one reusable row whose only tuple lives in 'tuple_mem' and whose string slot
+  // points at the 'data' vector below.
+  TupleRow* write_row = write_batch.GetRow(0);
+  TupleDescriptor* tuple_desc = string_desc_->tuple_descriptors()[0];
+  vector<uint8_t> tuple_mem(tuple_desc->byte_size());
+  Tuple* write_tuple = reinterpret_cast<Tuple*>(tuple_mem.data());
+  write_row->SetTuple(0, write_tuple);
+  StringValue* write_str = reinterpret_cast<StringValue*>(
+      write_tuple->GetSlot(tuple_desc->slots()[0]->tuple_offset()));
+  // Make the string large enough to fill a page.
+  const int64_t string_len = BIG_ROW_BYTES - tuple_desc->byte_size();
+  vector<char> data(string_len);
+  write_str->len = string_len;
+  write_str->ptr = data.data();
+
+  // We should be able to append MAX_BUFFERS without problem.
+  for (int i = 0; i < MAX_BUFFERS; ++i) {
+    // Fill the string with the value i.
+    memset(write_str->ptr, i, write_str->len);
+    bool success = stream.AddRow(write_row, &status);
+    ASSERT_TRUE(success);
+    // We should have one large page per row, plus a default-size read/write page, plus
+    // we waste the first default-size page in the stream by leaving it empty.
+    // NOTE(review): the value asserted below is exactly BIG_ROW_BYTES per row and does
+    // not include any default-size pages, so the comment above looks stale - confirm.
+    EXPECT_EQ(BIG_ROW_BYTES * (i + 1), client_.GetUsedReservation())
+        << i << ": " << client_.DebugString() << "\n"
+        << stream.DebugString();
+
+    // Read back the rows as we write them to test read/write mode.
+    read_batch.Reset();
+    bool eos;
+    ASSERT_OK(stream.GetNext(&read_batch, &eos));
+    EXPECT_EQ(1, read_batch.num_rows());
+    // The reader has caught up with the writer, so every read reports eos.
+    EXPECT_TRUE(eos);
+    Tuple* tuple = read_batch.GetRow(0)->GetTuple(0);
+    StringValue* str = reinterpret_cast<StringValue*>(
+        tuple->GetSlot(tuple_desc->slots()[0]->tuple_offset()));
+    EXPECT_EQ(string_len, str->len);
+    for (int j = 0; j < string_len; ++j) {
+      EXPECT_EQ(i, str->ptr[j]) << j;
+    }
+  }
+
+  // We can't fit another row in memory - need to unpin to make progress.
+  memset(write_str->ptr, MAX_BUFFERS, write_str->len);
+  bool success = stream.AddRow(write_row, &status);
+  // The failed AddRow() must leave 'status' OK; only the retry after unpinning succeeds.
+  ASSERT_FALSE(success);
+  ASSERT_OK(status);
+  stream.UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT);
+  success = stream.AddRow(write_row, &status);
+  ASSERT_TRUE(success);
+
+  // Read all the rows back and verify.
+  ASSERT_OK(stream.PrepareForRead(false, &got_reservation));
+  ASSERT_TRUE(got_reservation);
+  for (int i = 0; i < MAX_BUFFERS + 1; ++i) {
+    read_batch.Reset();
+    bool eos;
+    ASSERT_OK(stream.GetNext(&read_batch, &eos));
+    // One row is returned per GetNext() call; eos is expected only with the last row.
+    EXPECT_EQ(1, read_batch.num_rows());
+    EXPECT_EQ(eos, i == MAX_BUFFERS) << i;
+    Tuple* tuple = read_batch.GetRow(0)->GetTuple(0);
+    StringValue* str = reinterpret_cast<StringValue*>(
+        tuple->GetSlot(tuple_desc->slots()[0]->tuple_offset()));
+    EXPECT_EQ(string_len, str->len);
+    for (int j = 0; j < string_len; ++j) {
+      ASSERT_EQ(i, str->ptr[j]) << j;
+    }
+  }
+  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+}
+
+// Basic API test using the SimpleNullStreamTest fixture. No data should be going to
+// disk.
+TEST_F(SimpleNullStreamTest, Basic) {
+  Init(BUFFER_POOL_LIMIT);
+
+  // Values passed as the first argument of TestValues().
+  const int counts[] = {0, 1, 10, 100};
+  // {third, fourth} boolean argument pairs for TestValues(), in the order they run.
+  const bool flag_pairs[][2] = {
+      {false, true}, {true, true}, {false, false}, {true, false}};
+
+  for (const auto& flags : flag_pairs) {
+    for (int n : counts) {
+      TestValues<int>(n, int_desc_, flags[0], flags[1]);
+    }
+  }
+  for (const auto& flags : flag_pairs) {
+    for (int n : counts) {
+      TestValues<StringValue>(n, string_desc_, flags[0], flags[1]);
+    }
+  }
+
+  // {first, second} argument pairs for TestIntValuesInterleaved(); each pair is run
+  // with the third argument set to true first and then to false.
+  const int interleaved_args[][2] = {{0, 1}, {1, 1}, {10, 5}, {100, 15}};
+  const bool third_arg_values[] = {true, false};
+  for (bool third_arg : third_arg_values) {
+    for (const auto& args : interleaved_args) {
+      TestIntValuesInterleaved(args[0], args[1], third_arg);
+    }
+  }
+}
+
+// Single-buffer reservation combined with rows made of multiple tuples.
+TEST_F(MultiTupleStreamTest, MultiTupleOneBufferSpill) {
+  // A buffer holds only 128 ints, so this spills quite often.
+  const int buffer_size = 128 * sizeof(int);
+  Init(buffer_size);
+
+  // Values passed as the first argument of TestValues(), in the order they are run.
+  const int counts[] = {0, 1, 10};
+  for (int n : counts) {
+    TestValues<int>(n, int_desc_, false, true, buffer_size);
+  }
+  for (int n : counts) {
+    TestValues<StringValue>(n, string_desc_, false, true, buffer_size);
+  }
+}
+
+// Ten-buffer reservation combined with rows made of multiple tuples.
+TEST_F(MultiTupleStreamTest, MultiTupleManyBufferSpill) {
+  const int buffer_size = 128 * sizeof(int);
+  Init(10 * buffer_size);
+
+  // Values passed as the first argument of TestValues(), in the order they are run.
+  const int counts[] = {0, 1, 10, 100};
+  for (int n : counts) {
+    TestValues<int>(n, int_desc_, false, true, buffer_size);
+  }
+  for (int n : counts) {
+    TestValues<StringValue>(n, string_desc_, false, true, buffer_size);
+  }
+
+  // {first, second} argument pairs for TestIntValuesInterleaved().
+  const int interleaved_args[][2] = {{1, 1}, {10, 5}, {100, 15}};
+  for (const auto& args : interleaved_args) {
+    TestIntValuesInterleaved(args[0], args[1], true, buffer_size);
+  }
+}
+
+// Reserve space for a row in the stream, copy several tuples into it by hand, and
+// verify that the row can be read back from the stream.
+TEST_F(MultiTupleStreamTest, MultiTupleAddRowCustom) {
+  // Small buffers are used so that the data will be flushed to disk.
+  const int buffer_size = 4 * 1024;
+  Init(2 * buffer_size);
+  Status status = Status::OK();
+
+  BufferedTupleStream stream(
+      runtime_state_, string_desc_, &client_, buffer_size, buffer_size);
+  ASSERT_OK(stream.Init(-1, false));
+  bool got_write_reservation;
+  ASSERT_OK(stream.PrepareForWrite(&got_write_reservation));
+  ASSERT_TRUE(got_write_reservation);
+
+  const int num_batches = 1;
+  int rows_added = 0;
+  for (int batch_idx = 0; batch_idx < num_batches; ++batch_idx) {
+    RowBatch* batch = CreateStringBatch(rows_added, 1, false);
+    for (int row_idx = 0; row_idx < batch->num_rows(); ++row_idx) {
+      TupleRow* row = batch->GetRow(row_idx);
+      // Sum the fixed-length and variable-length bytes over all tuples of the row.
+      int64_t fixed_size = 0;
+      int64_t varlen_size = 0;
+      const int num_tuples = string_desc_->tuple_descriptors().size();
+      for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
+        TupleDescriptor* tuple_desc = string_desc_->tuple_descriptors()[tuple_idx];
+        fixed_size += tuple_desc->byte_size();
+        varlen_size += row->GetTuple(tuple_idx)->VarlenByteSize(*tuple_desc);
+      }
+      const int64_t row_size = fixed_size + varlen_size;
+      uint8_t* data = stream.AddRowCustomBegin(row_size, &status);
+      ASSERT_TRUE(data != nullptr);
+      ASSERT_TRUE(status.ok());
+      WriteStringRow(string_desc_, row, fixed_size, varlen_size, data);
+      stream.AddRowCustomEnd(row_size);
+    }
+    rows_added += batch->num_rows();
+  }
+
+  // Read the stream three times; only the last pass uses delete_on_read.
+  const int read_passes = 3;
+  for (int pass = 0; pass < read_passes; ++pass) {
+    const bool delete_on_read = (pass == read_passes - 1);
+    bool got_read_reservation;
+    ASSERT_OK(stream.PrepareForRead(delete_on_read, &got_read_reservation));
+    ASSERT_TRUE(got_read_reservation);
+    vector<StringValue> values_read;
+    ReadValues(&stream, string_desc_, &values_read);
+    VerifyResults<StringValue>(*string_desc_, values_read, rows_added, false);
+  }
+
+  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+}
+
+/// Serializes 'row' into the buffer at 'data'. The byte_size() bytes of each tuple are
+/// copied back to back starting at 'data'; the bytes of every slot's string follow,
+/// starting at data + fixed_size, and the copied tuples' string pointers are redirected
+/// to those copies. Finally asserts that the string data ended exactly at
+/// data + fixed_size + varlen_size.
+void SimpleTupleStreamTest::WriteStringRow(const RowDescriptor* row_desc, TupleRow* row,
+    int64_t fixed_size, int64_t varlen_size, uint8_t* data) {
+  uint8_t* next_tuple = data;
+  uint8_t* next_varlen = data + fixed_size;
+  const int num_tuples = row_desc->tuple_descriptors().size();
+  for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
+    TupleDescriptor* tuple_desc = row_desc->tuple_descriptors()[tuple_idx];
+    const auto tuple_bytes = tuple_desc->byte_size();
+    Tuple* src = row->GetTuple(tuple_idx);
+    Tuple* dst = reinterpret_cast<Tuple*>(next_tuple);
+    memcpy(dst, src, tuple_bytes);
+    next_tuple += tuple_bytes;
+    // Every slot of the tuple is treated as a string slot here.
+    for (SlotDescriptor* slot : tuple_desc->slots()) {
+      const auto slot_offset = slot->tuple_offset();
+      StringValue* src_string = src->GetStringSlot(slot_offset);
+      StringValue* dst_string = dst->GetStringSlot(slot_offset);
+      dst_string->ptr = reinterpret_cast<char*>(next_varlen);
+      memcpy(dst_string->ptr, src_string->ptr, src_string->len);
+      next_varlen += src_string->len;
+    }
+  }
+  ASSERT_EQ(data + fixed_size + varlen_size, next_varlen);
+}
+
+// Single-buffer reservation combined with rows of multiple nullable tuples.
+TEST_F(MultiNullableTupleStreamTest, MultiNullableTupleOneBufferSpill) {
+  // A buffer holds only 128 ints, so this spills quite often.
+  const int buffer_size = 128 * sizeof(int);
+  Init(buffer_size);
+
+  // Values passed as the first argument of TestValues().
+  const int counts[] = {0, 1, 10};
+  // Values passed as the third argument of TestValues(), in the order they are run.
+  const bool third_arg_values[] = {false, true};
+
+  for (bool third_arg : third_arg_values) {
+    for (int n : counts) {
+      TestValues<int>(n, int_desc_, third_arg, true, buffer_size);
+    }
+  }
+  for (bool third_arg : third_arg_values) {
+    for (int n : counts) {
+      TestValues<StringValue>(n, string_desc_, third_arg, true, buffer_size);
+    }
+  }
+}
+
+// Ten-buffer reservation combined with rows of multiple nullable tuples.
+TEST_F(MultiNullableTupleStreamTest, MultiNullableTupleManyBufferSpill) {
+  const int buffer_size = 128 * sizeof(int);
+  Init(10 * buffer_size);
+
+  // Values passed as the first argument of TestValues().
+  const int counts[] = {0, 1, 10, 100};
+  // Values passed as the third argument of TestValues(), in the order they are run.
+  const bool third_arg_values[] = {false, true};
+
+  for (bool third_arg : third_arg_values) {
+    for (int n : counts) {
+      TestValues<int>(n, int_desc_, third_arg, true, buffer_size);
+    }
+  }
+  for (bool third_arg : third_arg_values) {
+    for (int n : counts) {
+      TestValues<StringValue>(n, string_desc_, third_arg, true, buffer_size);
+    }
+  }
+
+  // {first, second} argument pairs for TestIntValuesInterleaved().
+  const int interleaved_args[][2] = {{0, 1}, {1, 1}, {10, 5}, {100, 15}};
+  for (const auto& args : interleaved_args) {
+    TestIntValuesInterleaved(args[0], args[1], true, buffer_size);
+  }
+}
+
+/// Check ComputeRowSize() on rows with NULL tuples, zero-filled tuples, a tuple with
+/// string data, and a string slot that is registered as stored externally.
+TEST_F(MultiNullableTupleStreamTest, TestComputeRowSize) {
+  Init(BUFFER_POOL_LIMIT);
+  const vector<TupleDescriptor*>& tuple_descs = string_desc_->tuple_descriptors();
+  // The string in the second tuple is stored externally.
+  const SlotDescriptor* external_string_slot = tuple_descs[1]->slots()[0];
+  set<SlotId> external_slots;
+  external_slots.insert(external_string_slot->id());
+
+  BufferedTupleStream stream(
+      runtime_state_, string_desc_, &client_, PAGE_LEN, PAGE_LEN, external_slots);
+
+  // Allocates a zero-filled tuple of desc->byte_size() bytes.
+  auto new_zeroed_tuple = [](const TupleDescriptor* desc) {
+    void* mem = malloc(desc->byte_size());
+    memset(mem, 0, desc->byte_size());
+    return reinterpret_cast<Tuple*>(mem);
+  };
+  gscoped_ptr<TupleRow, FreeDeleter> row(
+      reinterpret_cast<TupleRow*>(malloc(tuple_descs.size() * sizeof(Tuple*))));
+  gscoped_ptr<Tuple, FreeDeleter> tuple0(new_zeroed_tuple(tuple_descs[0]));
+  gscoped_ptr<Tuple, FreeDeleter> tuple1(new_zeroed_tuple(tuple_descs[1]));
+  gscoped_ptr<Tuple, FreeDeleter> tuple2(new_zeroed_tuple(tuple_descs[2]));
+  const int tuple_null_indicator_bytes = 1; // Need 1 bytes for 3 tuples.
+
+  // Case 1: the second and third tuples of the row are NULL.
+  row->SetTuple(0, tuple0.get());
+  row->SetTuple(1, nullptr);
+  row->SetTuple(2, nullptr);
+  EXPECT_EQ(tuple_null_indicator_bytes + tuple_descs[0]->byte_size(),
+      stream.ComputeRowSize(row.get()));
+
+  // Case 2: all tuples present but zero-filled, so there is no var-len data.
+  row->SetTuple(1, tuple1.get());
+  row->SetTuple(2, tuple2.get());
+  EXPECT_EQ(tuple_null_indicator_bytes + string_desc_->GetRowSize(),
+      stream.ComputeRowSize(row.get()));
+
+  // Case 3: the first tuple's string slot holds STRINGS[0].
+  const SlotDescriptor* string_slot = tuple_descs[0]->slots()[0];
+  StringValue* in_stream_str = tuple0->GetStringSlot(string_slot->tuple_offset());
+  *in_stream_str = STRINGS[0];
+  int64_t expected_len =
+      tuple_null_indicator_bytes + string_desc_->GetRowSize() + in_stream_str->len;
+  EXPECT_EQ(expected_len, stream.ComputeRowSize(row.get()));
+
+  // Case 4: data behind the external slot must not be included in the count.
+  StringValue* external_str =
+      tuple1->GetStringSlot(external_string_slot->tuple_offset());
+  external_str->ptr = reinterpret_cast<char*>(1234);
+  external_str->len = 1234;
+  EXPECT_EQ(expected_len, stream.ComputeRowSize(row.get()));
+
+  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+}
+
+/// Test that deep copy works with arrays by copying into a BufferedTupleStream, freeing
+/// the original rows, then reading back the rows and verifying the contents.
+/// The write and read phases walk 'array_lens' and STRINGS with the same cycling
+/// indices, so the read phase can regenerate the expected values.
+TEST_F(ArrayTupleStreamTest, TestArrayDeepCopy) {
+  Status status;
+  Init(BUFFER_POOL_LIMIT);
+  const int NUM_ROWS = 4000;
+  BufferedTupleStream stream(runtime_state_, array_desc_, &client_, PAGE_LEN, PAGE_LEN);
+  const vector<TupleDescriptor*>& tuple_descs = array_desc_->tuple_descriptors();
+  // Write out a predictable pattern of data by iterating over arrays of constants.
+  int strings_index = 0; // we take the mod of this as index into STRINGS.
+  int array_lens[] = {0, 1, 5, 10, 1000, 2, 49, 20};
+  int num_array_lens = sizeof(array_lens) / sizeof(array_lens[0]);
+  int array_len_index = 0;
+  ASSERT_OK(stream.Init(-1, false));
+  bool got_write_reservation;
+  ASSERT_OK(stream.PrepareForWrite(&got_write_reservation));
+  ASSERT_TRUE(got_write_reservation);
+
+  for (int i = 0; i < NUM_ROWS; ++i) {
+    const int tuple_null_indicator_bytes = 1; // Need 1 bytes for 2 tuples.
+    // Start with the fixed-length parts; array contents are added as they are built.
+    int expected_row_size = tuple_null_indicator_bytes + tuple_descs[0]->byte_size()
+        + tuple_descs[1]->byte_size();
+    // Allocate a two-tuple row with zero-filled tuples; freed at the end of the
+    // iteration by the gscoped_ptrs.
+    gscoped_ptr<TupleRow, FreeDeleter> row(
+        reinterpret_cast<TupleRow*>(malloc(tuple_descs.size() * sizeof(Tuple*))));
+    gscoped_ptr<Tuple, FreeDeleter> tuple0(
+        reinterpret_cast<Tuple*>(malloc(tuple_descs[0]->byte_size())));
+    gscoped_ptr<Tuple, FreeDeleter> tuple1(
+        reinterpret_cast<Tuple*>(malloc(tuple_descs[1]->byte_size())));
+    memset(tuple0.get(), 0, tuple_descs[0]->byte_size());
+    memset(tuple1.get(), 0, tuple_descs[1]->byte_size());
+    row->SetTuple(0, tuple0.get());
+    row->SetTuple(1, tuple1.get());
+
+    // Only array<string> is non-null.
+    tuple0->SetNull(tuple_descs[0]->slots()[1]->null_indicator_offset());
+    tuple1->SetNull(tuple_descs[1]->slots()[0]->null_indicator_offset());
+    const SlotDescriptor* array_slot_desc = tuple_descs[0]->slots()[0];
+    const TupleDescriptor* item_desc = array_slot_desc->collection_item_descriptor();
+
+    // Pick the next array length from the cycle and start with an empty collection.
+    int array_len = array_lens[array_len_index++ % num_array_lens];
+    CollectionValue* cv = tuple0->GetCollectionSlot(array_slot_desc->tuple_offset());
+    cv->ptr = nullptr;
+    cv->num_tuples = 0;
+    CollectionValueBuilder builder(
+        cv, *item_desc, mem_pool_.get(), runtime_state_, array_len);
+    Tuple* array_data;
+    // 'num_rows' is an out-parameter of GetFreeMemory() that this test never reads.
+    int num_rows;
+    builder.GetFreeMemory(&array_data, &num_rows);
+    // Account for the fixed-length part of every array item.
+    expected_row_size += item_desc->byte_size() * array_len;
+
+    // Fill the array with pointers to our constant strings.
+    for (int j = 0; j < array_len; ++j) {
+      const StringValue* string = &STRINGS[strings_index++ % NUM_STRINGS];
+      array_data->SetNotNull(item_desc->slots()[0]->null_indicator_offset());
+      RawValue::Write(string, array_data, item_desc->slots()[0], mem_pool_.get());
+      array_data += item_desc->byte_size();
+      expected_row_size += string->len;
+    }
+    builder.CommitTuples(array_len);
+
+    // Check that internal row size computation gives correct result.
+    EXPECT_EQ(expected_row_size, stream.ComputeRowSize(row.get()));
+    bool b = stream.AddRow(row.get(), &status);
+    ASSERT_TRUE(b);
+    ASSERT_OK(status);
+    mem_pool_->FreeAll(); // Free data as soon as possible to smoke out issues.
+  }
+
+  // Read back and verify data.
+  bool got_read_reservation;
+  ASSERT_OK(stream.PrepareForRead(false, &got_read_reservation));
+  ASSERT_TRUE(got_read_reservation);
+  // Restart both cycles so the expected values come out in the order they were written.
+  strings_index = 0;
+  array_len_index = 0;
+  bool eos = false;
+  int rows_read = 0;
+  RowBatch batch(array_desc_, BATCH_SIZE, &tracker_);
+  do {
+    batch.Reset();
+    ASSERT_OK(stream.GetNext(&batch, &eos));
+    for (int i = 0; i < batch.num_rows(); ++i) {
+      TupleRow* row = batch.GetRow(i);
+      Tuple* tuple0 = row->GetTuple(0);
+      Tuple* tuple1 = row->GetTuple(1);
+      ASSERT_TRUE(tuple0 != nullptr);
+      ASSERT_TRUE(tuple1 != nullptr);
+      // The null flags must match what was written: only the first array slot is set.
+      const SlotDescriptor* array_slot_desc = tuple_descs[0]->slots()[0];
+      ASSERT_FALSE(tuple0->IsNull(array_slot_desc->null_indicator_offset()));
+      ASSERT_TRUE(tuple0->IsNull(tuple_descs[0]->slots()[1]->null_indicator_offset()));
+      ASSERT_TRUE(tuple1->IsNull(tuple_descs[1]->slots()[0]->null_indicator_offset()));
+
+      const TupleDescriptor* item_desc = array_slot_desc->collection_item_descriptor();
+      int expected_array_len = array_lens[array_len_index++ % num_array_lens];
+      CollectionValue* cv = tuple0->GetCollectionSlot(array_slot_desc->tuple_offset());
+      ASSERT_EQ(expected_array_len, cv->num_tuples);
+      // Compare each array item against the next string in the cycle.
+      for (int j = 0; j < cv->num_tuples; ++j) {
+        Tuple* item = reinterpret_cast<Tuple*>(cv->ptr + j * item_desc->byte_size());
+        const SlotDescriptor* string_desc = item_desc->slots()[0];
+        ASSERT_FALSE(item->IsNull(string_desc->null_indicator_offset()));
+        const StringValue* expected = &STRINGS[strings_index++ % NUM_STRINGS];
+        const StringValue* actual = item->GetStringSlot(string_desc->tuple_offset());
+        ASSERT_EQ(*expected, *actual);
+      }
+    }
+    rows_read += batch.num_rows();
+  } while (!eos);
+  ASSERT_EQ(NUM_ROWS, rows_read);
+  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+}
+
+/// Test that ComputeRowSize() handles NULL tuples, NULL array slots and external slots.
+TEST_F(ArrayTupleStreamTest, TestComputeRowSize) {
+  Init(BUFFER_POOL_LIMIT);
+  const vector<TupleDescriptor*>& tuple_descs = array_desc_->tuple_descriptors();
+  set<SlotId> external_slots;
+  // Second array slot in first tuple is stored externally.
+  const SlotDescriptor* external_array_slot = tuple_descs[0]->slots()[1];
+  external_slots.insert(external_array_slot->id());
+
+  BufferedTupleStream stream(
+      runtime_state_, array_desc_, &client_, PAGE_LEN, PAGE_LEN, external_slots);
+  gscoped_ptr<TupleRow, FreeDeleter> row(
+      reinterpret_cast<TupleRow*>(malloc(tuple_descs.size() * sizeof(Tuple*))));
+  gscoped_ptr<Tuple, FreeDeleter> tuple0(
+      reinterpret_cast<Tuple*>(malloc(tuple_descs[0]->byte_size())));
+  gscoped_ptr<Tuple, FreeDeleter> tuple1(
+      reinterpret_cast<Tuple*>(malloc(tuple_descs[1]->byte_size())));
+  memset(tuple0.get(), 0, tuple_descs[0]->byte_size());
+  memset(tuple1.get(), 0, tuple_descs[1]->byte_size());
+
+  const int tuple_null_indicator_bytes = 1; // 1 byte; this row only sets tuples 0 and 1.
+
+  // All tuples are NULL - only need null indicators.
+  row->SetTuple(0, nullptr);
+  row->SetTuple(1, nullptr);
+  EXPECT_EQ(tuple_null_indicator_bytes, stream.ComputeRowSize(row.get()));
+
+  // Tuples are initialized to empty and have no var-len data.
+  row->SetTuple(0, tuple0.get());
+  row->SetTuple(1, tuple1.get());
+  EXPECT_EQ(tuple_null_indicator_bytes + array_desc_->GetRowSize(),
+      stream.ComputeRowSize(row.get()));
+
+  // Tuple 0 has an array.
+  int expected_row_size = tuple_null_indicator_bytes + array_desc_->GetRowSize();
+  const SlotDescriptor* array_slot = tuple_descs[0]->slots()[0];
+  const TupleDescriptor* item_desc = array_slot->collection_item_descriptor();
+  int array_len = 128;
+  CollectionValue* cv = tuple0->GetCollectionSlot(array_slot->tuple_offset());
+  CollectionValueBuilder builder(
+      cv, *item_desc, mem_pool_.get(), runtime_state_, array_len);
+  Tuple* array_data;
+  int num_rows;
+  builder.GetFreeMemory(&array_data, &num_rows);
+  expected_row_size += item_desc->byte_size() * array_len;
+
+  // Fill the array with pointers to our constant strings.
+  for (int i = 0; i < array_len; ++i) {
+    const StringValue* str = &STRINGS[i % NUM_STRINGS];
+    array_data->SetNotNull(item_desc->slots()[0]->null_indicator_offset());
+    RawValue::Write(str, array_data, item_desc->slots()[0], mem_pool_.get());
+    array_data += item_desc->byte_size(); // NOTE(review): assumes sizeof(Tuple) == 1.
+    expected_row_size += str->len;
+  }
+  builder.CommitTuples(array_len);
+  EXPECT_EQ(expected_row_size, stream.ComputeRowSize(row.get()));
+
+  // Check that the external slot isn't included in size.
+  cv = tuple0->GetCollectionSlot(external_array_slot->tuple_offset());
+  // ptr of external slot shouldn't be dereferenced when computing size.
+  cv->ptr = reinterpret_cast<uint8_t*>(1234);
+  cv->num_tuples = 1234;
+  EXPECT_EQ(expected_row_size, stream.ComputeRowSize(row.get()));
+
+  // Check that the array is excluded if tuple 0's array has its null indicator set.
+  tuple0->SetNull(array_slot->null_indicator_offset());
+  EXPECT_EQ(tuple_null_indicator_bytes + array_desc_->GetRowSize(),
+      stream.ComputeRowSize(row.get()));
+
+  stream.Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+}
+}
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  impala::InitCommonRuntime(argc, argv, true, impala::TestInfo::BE_TEST);
+  impala::InitFeSupport();
+  impala::LlvmCodeGen::InitializeLlvm();
+  return RUN_ALL_TESTS();
+}


[09/11] incubator-impala git commit: IMPALA-5572: Timestamp codegen for text scanner

Posted by ta...@apache.org.
IMPALA-5572: Timestamp codegen for text scanner

Currently codegen is disabled when scanning text tables with timestamp
columns. The message is "Timestamp not yet supported for codegen."
This patch adds support for timestamp codegen.
A simple query from the comment section of this issue runs slightly
faster (4%) with codegen than the interpreted version.

Testing: The patch passed the tests run with the exhaustive workload
exploration strategy.

Change-Id: I00cbf8ec7784ca9594e14e952f46dc54a5ede44b
Reviewed-on: http://gerrit.cloudera.org:8080/7556
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/d61065d6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/d61065d6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/d61065d6

Branch: refs/heads/master
Commit: d61065d6383ece19fdafd2526ecf00dab4e1f4d4
Parents: 0c46147
Author: Tianyi Wang <tw...@cloudera.com>
Authored: Tue Aug 1 14:52:55 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Aug 8 10:23:29 2017 +0000

----------------------------------------------------------------------
 be/src/codegen/gen_ir_descriptions.py |  1 +
 be/src/exec/hdfs-scanner-ir.cc        |  6 ++++++
 be/src/exec/hdfs-scanner.cc           |  3 ---
 be/src/exec/text-converter.cc         | 19 ++++++++++++++++---
 be/src/util/string-parser.h           | 11 +++++++++++
 5 files changed, 34 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d61065d6/be/src/codegen/gen_ir_descriptions.py
----------------------------------------------------------------------
diff --git a/be/src/codegen/gen_ir_descriptions.py b/be/src/codegen/gen_ir_descriptions.py
index be4be80..3cc195c 100755
--- a/be/src/codegen/gen_ir_descriptions.py
+++ b/be/src/codegen/gen_ir_descriptions.py
@@ -185,6 +185,7 @@ ir_functions = [
   ["STRING_TO_INT64", "IrStringToInt64"],
   ["STRING_TO_FLOAT", "IrStringToFloat"],
   ["STRING_TO_DOUBLE", "IrStringToDouble"],
+  ["STRING_TO_TIMESTAMP", "IrStringToTimestamp"],
   ["IS_NULL_STRING", "IrIsNullString"],
   ["GENERIC_IS_NULL_STRING", "IrGenericIsNullString"],
   ["RAW_VALUE_COMPARE",

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d61065d6/be/src/exec/hdfs-scanner-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scanner-ir.cc b/be/src/exec/hdfs-scanner-ir.cc
index 96aad07..143adb7 100644
--- a/be/src/exec/hdfs-scanner-ir.cc
+++ b/be/src/exec/hdfs-scanner-ir.cc
@@ -117,6 +117,12 @@ double IrStringToDouble(const char* s, int len, StringParser::ParseResult* resul
 }
 
 extern "C"
+TimestampValue IrStringToTimestamp(const char* s, int len,
+    StringParser::ParseResult* result) {
+  return StringParser::StringToTimestamp(s, len, result);
+}
+
+extern "C"
 bool IrIsNullString(const char* data, int len) {
   return data == NULL || (len == 2 && data[0] == '\\' && data[1] == 'N');
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d61065d6/be/src/exec/hdfs-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc
index 10e4edd..23fcc20 100644
--- a/be/src/exec/hdfs-scanner.cc
+++ b/be/src/exec/hdfs-scanner.cc
@@ -311,9 +311,6 @@ Status HdfsScanner::CodegenWriteCompleteTuple(HdfsScanNodeBase* node,
   // TODO: Timestamp is not yet supported
   for (int i = 0; i < node->materialized_slots().size(); ++i) {
     SlotDescriptor* slot_desc = node->materialized_slots()[i];
-    if (slot_desc->type().type == TYPE_TIMESTAMP) {
-      return Status::Expected("Timestamp not yet supported for codegen.");
-    }
     if (slot_desc->type().type == TYPE_DECIMAL) {
       return Status::Expected("Decimal not yet supported for codegen.");
     }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d61065d6/be/src/exec/text-converter.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/text-converter.cc b/be/src/exec/text-converter.cc
index e12be85..db390c4 100644
--- a/be/src/exec/text-converter.cc
+++ b/be/src/exec/text-converter.cc
@@ -220,6 +220,9 @@ Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,
       case TYPE_DOUBLE:
         parse_fn_enum = IRFunction::STRING_TO_DOUBLE;
         break;
+      case TYPE_TIMESTAMP:
+        parse_fn_enum = IRFunction::STRING_TO_TIMESTAMP;
+        break;
       default:
         DCHECK(false);
         return NULL;
@@ -234,9 +237,18 @@ Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,
     LlvmCodeGen::NamedVariable parse_result("parse_result", codegen->GetType(TYPE_INT));
     Value* parse_result_ptr = codegen->CreateEntryBlockAlloca(fn, parse_result);
 
+    Value* parse_return;
     // Call Impala's StringTo* function
-    Value* result = builder.CreateCall(parse_fn,
-        ArrayRef<Value*>({args[1], args[2], parse_result_ptr}));
+    if (parse_fn->arg_size() == 3) {
+      parse_return = builder.CreateCall(parse_fn, {args[1], args[2], parse_result_ptr});
+    } else {
+      DCHECK(parse_fn->arg_size() == 4);
+      // If the return value is large (more than 16 bytes in our toolchain), the first
+      // parameter is a pointer to the parsed value and the callee's return value
+      // should be ignored.
+      builder.CreateCall(parse_fn, {slot, args[1], args[2], parse_result_ptr});
+      parse_return = nullptr;
+    }
     Value* parse_result_val = builder.CreateLoad(parse_result_ptr, "parse_result");
     Value* failed_value = codegen->GetIntConstant(TYPE_INT, StringParser::PARSE_FAILURE);
 
@@ -254,7 +266,8 @@ Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,
 
     // Parse succeeded
     builder.SetInsertPoint(parse_success_block);
-    builder.CreateStore(result, slot);
+    // If the parsed value is in parse_return, move it into slot
+    if (parse_fn->arg_size() == 3) builder.CreateStore(parse_return, slot);
     builder.CreateRet(codegen->true_value());
 
     // Parse failed, set slot to null and return false

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d61065d6/be/src/util/string-parser.h
----------------------------------------------------------------------
diff --git a/be/src/util/string-parser.h b/be/src/util/string-parser.h
index cc9c517..d6b94c7 100644
--- a/be/src/util/string-parser.h
+++ b/be/src/util/string-parser.h
@@ -25,6 +25,8 @@
 
 #include "common/logging.h"
 #include "runtime/decimal-value.h"
+#include "runtime/timestamp-parse-util.h"
+#include "runtime/timestamp-value.h"
 #include "util/decimal-util.h"
 
 namespace impala {
@@ -95,6 +97,15 @@ class StringParser {
     return StringToBoolInternal(s + i, len - i, result);
   }
 
+  /// Parses a TimestampValue from 's'; sets *result to PARSE_SUCCESS or PARSE_FAILURE.
+  static inline TimestampValue StringToTimestamp(const char* s, int len,
+      ParseResult* result) {
+    boost::gregorian::date d;
+    boost::posix_time::time_duration t;
+    *result = TimestampParser::Parse(s, len, &d, &t) ? PARSE_SUCCESS : PARSE_FAILURE;
+    return {d, t};
+  }
+
   /// Parses a decimal from s, returning the result.
   /// The parse status is returned in *result.
   /// On overflow or invalid values, the return value is undefined.