You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/03/26 22:49:15 UTC

[1/2] impala git commit: IMPALA-6614: ClientRequestState should use HS2 TOperationState

Repository: impala
Updated Branches:
  refs/heads/master cd939a241 -> 37565e381


IMPALA-6614: ClientRequestState should use HS2 TOperationState

Currently it uses beeswax's QueryState enum, but the TOperationState
is a superset. In order to remove dependencies on beeswax, and also
set things up for a future change to use the TOperationState explicit
CANCELED_STATE (see IMPALA-1262), migrate CLR to use TOperationState.

The intent of this change is to make no client visible change.

Change-Id: I36287eaf8f1dac23c306b470f95f379dfdc6bb5b
Reviewed-on: http://gerrit.cloudera.org:8080/9501
Reviewed-by: Dan Hecht <dh...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/66e5a121
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/66e5a121
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/66e5a121

Branch: refs/heads/master
Commit: 66e5a1212aaab387035a4cce2b1d233a15fdb287
Parents: cd939a2
Author: Dan Hecht <dh...@cloudera.com>
Authored: Mon Mar 5 16:08:28 2018 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Mon Mar 26 22:40:44 2018 +0000

----------------------------------------------------------------------
 be/src/service/client-request-state.cc  | 55 ++++++++++++++++++----------
 be/src/service/client-request-state.h   | 54 ++++++++++++++++-----------
 be/src/service/impala-beeswax-server.cc | 11 +++---
 be/src/service/impala-hs2-server.cc     | 19 ++--------
 be/src/service/impala-http-handler.cc   |  3 +-
 be/src/service/impala-server.cc         |  9 +++--
 6 files changed, 84 insertions(+), 67 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/66e5a121/be/src/service/client-request-state.cc
----------------------------------------------------------------------
diff --git a/be/src/service/client-request-state.cc b/be/src/service/client-request-state.cc
index 16ea3e0..ac6178c 100644
--- a/be/src/service/client-request-state.cc
+++ b/be/src/service/client-request-state.cc
@@ -101,7 +101,7 @@ ClientRequestState::ClientRequestState(
       TimePrecision::Nanosecond));
   summary_profile_->AddInfoString("End Time", "");
   summary_profile_->AddInfoString("Query Type", "N/A");
-  summary_profile_->AddInfoString("Query State", PrintQueryState(query_state_));
+  summary_profile_->AddInfoString("Query State", PrintQueryState(BeeswaxQueryState()));
   summary_profile_->AddInfoString("Query Status", "OK");
   summary_profile_->AddInfoString("Impala Version", GetVersionString(/* compact */ true));
   summary_profile_->AddInfoString("User", effective_user());
@@ -214,7 +214,7 @@ Status ClientRequestState::Exec(TExecRequest* exec_request) {
     }
     default:
       stringstream errmsg;
-      errmsg << "Unknown  exec request stmt type: " << exec_request_.stmt_type;
+      errmsg << "Unknown exec request stmt type: " << exec_request_.stmt_type;
       return Status(errmsg.str());
   }
 }
@@ -644,8 +644,11 @@ void ClientRequestState::Wait() {
     discard_result(UpdateQueryStatus(status));
   }
   if (status.ok()) {
-    UpdateNonErrorQueryState(beeswax::QueryState::FINISHED);
+    UpdateNonErrorOperationState(TOperationState::FINISHED_STATE);
   }
+  // UpdateQueryStatus() or UpdateNonErrorOperationState() have updated operation_state_.
+  DCHECK(operation_state_ == TOperationState::FINISHED_STATE ||
+      operation_state_ == TOperationState::ERROR_STATE);
 }
 
 Status ClientRequestState::WaitInternal() {
@@ -720,17 +723,18 @@ Status ClientRequestState::RestartFetch() {
   return Status::OK();
 }
 
-void ClientRequestState::UpdateNonErrorQueryState(
-    beeswax::QueryState::type query_state) {
+void ClientRequestState::UpdateNonErrorOperationState(
+    TOperationState::type operation_state) {
   lock_guard<mutex> l(lock_);
-  DCHECK(query_state != beeswax::QueryState::EXCEPTION);
-  if (query_state_ < query_state) UpdateQueryState(query_state);
+  DCHECK(operation_state == TOperationState::RUNNING_STATE
+      || operation_state == TOperationState::FINISHED_STATE);
+  if (operation_state_ < operation_state) UpdateOperationState(operation_state);
 }
 
 Status ClientRequestState::UpdateQueryStatus(const Status& status) {
   // Preserve the first non-ok status
   if (!status.ok() && query_status_.ok()) {
-    UpdateQueryState(beeswax::QueryState::EXCEPTION);
+    UpdateOperationState(TOperationState::ERROR_STATE);
     query_status_ = status;
     summary_profile_->AddInfoStringRedacted("Query Status", query_status_.GetDetail());
   }
@@ -740,12 +744,13 @@ Status ClientRequestState::UpdateQueryStatus(const Status& status) {
 
 Status ClientRequestState::FetchRowsInternal(const int32_t max_rows,
     QueryResultSet* fetched_rows) {
-  DCHECK(query_state_ != beeswax::QueryState::EXCEPTION);
+  // Wait() guarantees that we've transitioned at least to FINISHED_STATE (and any
+  // state beyond that should have a non-OK query_status_ set).
+  DCHECK(operation_state_ == TOperationState::FINISHED_STATE);
 
   if (eos_) return Status::OK();
 
   if (request_result_set_ != NULL) {
-    UpdateQueryState(beeswax::QueryState::FINISHED);
     int num_rows = 0;
     const vector<TResultRow>& all_rows = (*(request_result_set_.get()));
     // max_rows <= 0 means no limit
@@ -773,9 +778,6 @@ Status ClientRequestState::FetchRowsInternal(const int32_t max_rows,
     if (num_rows_fetched_from_cache >= max_rows) return Status::OK();
   }
 
-  // results will be ready after this call
-  UpdateQueryState(beeswax::QueryState::FINISHED);
-
   // Maximum number of rows to be fetched from the coord.
   int32_t max_coord_rows = max_rows;
   if (max_rows > 0) {
@@ -876,13 +878,13 @@ Status ClientRequestState::Cancel(bool check_inflight, const Status* cause) {
   Coordinator* coord;
   {
     lock_guard<mutex> lock(lock_);
-    // If the query is completed or cancelled, no need to update state.
-    bool already_done = eos_ || query_state_ == beeswax::QueryState::EXCEPTION;
+    // If the query has reached a terminal state, no need to update the state.
+    bool already_done = eos_ || operation_state_ == TOperationState::ERROR_STATE;
     if (!already_done && cause != NULL) {
       DCHECK(!cause->ok());
       discard_result(UpdateQueryStatus(*cause));
       query_events_->MarkEvent("Cancelled");
-      DCHECK_EQ(query_state_, beeswax::QueryState::EXCEPTION);
+      DCHECK_EQ(operation_state_, TOperationState::ERROR_STATE);
     }
     // Get a copy of the coordinator pointer while holding 'lock_'.
     coord = coord_.get();
@@ -1101,10 +1103,23 @@ void ClientRequestState::ClearResultCache() {
   result_cache_.reset(NULL);
 }
 
-void ClientRequestState::UpdateQueryState(
-    beeswax::QueryState::type query_state) {
-  query_state_ = query_state;
-  summary_profile_->AddInfoString("Query State", PrintQueryState(query_state_));
+void ClientRequestState::UpdateOperationState(
+    TOperationState::type operation_state) {
+  operation_state_ = operation_state;
+  summary_profile_->AddInfoString("Query State", PrintQueryState(BeeswaxQueryState()));
+}
+
+beeswax::QueryState::type ClientRequestState::BeeswaxQueryState() const {
+  switch (operation_state_) {
+    case TOperationState::INITIALIZED_STATE: return beeswax::QueryState::CREATED;
+    case TOperationState::RUNNING_STATE: return beeswax::QueryState::RUNNING;
+    case TOperationState::FINISHED_STATE: return beeswax::QueryState::FINISHED;
+    case TOperationState::ERROR_STATE: return beeswax::QueryState::EXCEPTION;
+    default: {
+      DCHECK(false) << "Add explicit translation for all used TOperationState values";
+      return beeswax::QueryState::EXCEPTION;
+    }
+  }
 }
 
 }

http://git-wip-us.apache.org/repos/asf/impala/blob/66e5a121/be/src/service/client-request-state.h
----------------------------------------------------------------------
diff --git a/be/src/service/client-request-state.h b/be/src/service/client-request-state.h
index 0341ec5..0c05bc4 100644
--- a/be/src/service/client-request-state.h
+++ b/be/src/service/client-request-state.h
@@ -28,7 +28,7 @@
 #include "util/condition-variable.h"
 #include "util/runtime-profile.h"
 #include "gen-cpp/Frontend_types.h"
-#include "gen-cpp/Frontend_types.h"
+#include "gen-cpp/ImpalaHiveServer2Service.h"
 
 #include <boost/thread.hpp>
 #include <boost/unordered_set.hpp>
@@ -74,8 +74,8 @@ class ClientRequestState {
   Status Exec(const TMetadataOpRequest& exec_request) WARN_UNUSED_RESULT;
 
   /// Call this to ensure that rows are ready when calling FetchRows(). Updates the
-  /// query_status_, and advances query_state_ to FINISHED or EXCEPTION. Must be preceded
-  /// by call to Exec(). Waits for all child queries to complete. Takes lock_.
+  /// query_status_, and advances operation_state_ to FINISHED or EXCEPTION. Must be
+  /// preceded by call to Exec(). Waits for all child queries to complete. Takes lock_.
   void Wait();
 
   /// Calls Wait() asynchronously in a thread and returns immediately.
@@ -93,7 +93,7 @@ class ClientRequestState {
   /// Caller needs to hold fetch_rows_lock_ and lock_.
   /// Caller should verify that EOS has not be reached before calling.
   /// Must be preceeded by call to Wait() (or WaitAsync()/BlockOnWait()).
-  /// Also updates query_state_/status_ in case of error.
+  /// Also updates operation_state_/query_status_ in case of error.
   Status FetchRows(const int32_t max_rows, QueryResultSet* fetched_rows)
       WARN_UNUSED_RESULT;
 
@@ -106,11 +106,12 @@ class ClientRequestState {
   /// The caller must hold fetch_rows_lock_ and lock_.
   Status RestartFetch() WARN_UNUSED_RESULT;
 
-  /// Update query state if the requested state isn't already obsolete. This is only for
-  /// non-error states - if the query encounters an error the query status needs to be set
-  /// with information about the error so UpdateQueryStatus must be used instead.
-  /// Takes lock_.
-  void UpdateNonErrorQueryState(beeswax::QueryState::type query_state);
+  /// Update operation state if the requested state isn't already obsolete. This is
+  /// only for non-error states - if the query encounters an error the query status
+  /// needs to be set with information about the error so UpdateQueryStatus() must be
+  /// used instead. Takes lock_.
+  void UpdateNonErrorOperationState(
+      apache::hive::service::cli::thrift::TOperationState::type operation_state);
 
   /// Update the query status and the "Query Status" summary profile string.
   /// If current status is already != ok, no update is made (we preserve the first error)
@@ -124,7 +125,7 @@ class ClientRequestState {
 
   /// Cancels the child queries and the coordinator with the given cause.
   /// If cause is NULL, assume this was deliberately cancelled by the user.
-  /// Otherwise, sets state to EXCEPTION.
+  /// Otherwise, sets state to ERROR_STATE (TODO: use CANCELED_STATE).
   /// Does nothing if the query has reached EOS or already cancelled.
   ///
   /// Only returns an error if 'check_inflight' is true and the query is not yet
@@ -137,7 +138,7 @@ class ClientRequestState {
   void Done();
 
   /// Sets the API-specific (Beeswax, HS2) result cache and its size bound.
-  /// The given cache is owned by this query exec state, even if an error is returned.
+  /// The given cache is owned by this client request state, even if an error is returned.
   /// Returns a non-ok status if max_size exceeds the per-impalad allowed maximum.
   Status SetResultCache(QueryResultSet* cache, int64_t max_size) WARN_UNUSED_RESULT;
 
@@ -179,7 +180,12 @@ class ClientRequestState {
   }
   boost::mutex* lock() { return &lock_; }
   boost::mutex* fetch_rows_lock() { return &fetch_rows_lock_; }
-  beeswax::QueryState::type query_state() const { return query_state_; }
+  apache::hive::service::cli::thrift::TOperationState::type operation_state() const {
+    return operation_state_;
+  }
+  // Translate operation_state() to a beeswax::QueryState. TODO: remove calls to this
+  // and replace with uses of operation_state() directly.
+  beeswax::QueryState::type BeeswaxQueryState() const;
   const Status& query_status() const { return query_status_; }
   void set_result_metadata(const TResultSetMetadata& md) { result_metadata_ = md; }
   void set_user_profile_access(bool user_has_profile_access) {
@@ -335,15 +341,20 @@ class ClientRequestState {
 
   bool is_cancelled_ = false; // if true, Cancel() was called.
   bool eos_ = false;  // if true, there are no more rows to return
-  /// We enforce the invariant that query_status_ is not OK iff query_state_ is EXCEPTION,
-  /// given that lock_ is held. query_state_ should only be updated using
-  /// UpdateQueryState(), to ensure that the query profile is also updated.
-  beeswax::QueryState::type query_state_ = beeswax::QueryState::CREATED;
+
+  /// We enforce the invariant that query_status_ is not OK iff operation_state_ is
+  /// ERROR_STATE, given that lock_ is held. operation_state_ should only be updated
+  /// using UpdateOperationState(), to ensure that the query profile is also updated.
+  apache::hive::service::cli::thrift::TOperationState::type operation_state_ =
+      apache::hive::service::cli::thrift::TOperationState::INITIALIZED_STATE;
+
   Status query_status_;
   TExecRequest exec_request_;
+
   /// If true, effective_user() has access to the runtime profile and execution
   /// summary.
   bool user_has_profile_access_ = true;
+
   TResultSetMetadata result_metadata_; // metadata for select query
   RowBatch* current_batch_ = nullptr; // the current row batch; only applicable if coord is set
   int current_batch_row_ = 0 ; // number of rows fetched within the current batch
@@ -395,10 +406,10 @@ class ClientRequestState {
   /// Executes a LOAD DATA
   Status ExecLoadDataRequest() WARN_UNUSED_RESULT;
 
-  /// Core logic of Wait(). Does not update query_state_/status_.
+  /// Core logic of Wait(). Does not update operation_state_/query_status_.
   Status WaitInternal() WARN_UNUSED_RESULT;
 
-  /// Core logic of FetchRows(). Does not update query_state_/status_.
+  /// Core logic of FetchRows(). Does not update operation_state_/query_status_.
   /// Caller needs to hold fetch_rows_lock_ and lock_.
   Status FetchRowsInternal(const int32_t max_rows, QueryResultSet* fetched_rows)
       WARN_UNUSED_RESULT;
@@ -436,10 +447,11 @@ class ClientRequestState {
   /// This function is a no-op if the cache has already been cleared.
   void ClearResultCache();
 
-  /// Update the query state and the "Query State" summary profile string.
+  /// Update the operation state and the "Query State" summary profile string.
   /// Does not take lock_, but requires it: caller must ensure lock_
-  /// is taken before calling UpdateQueryState.
-  void UpdateQueryState(beeswax::QueryState::type query_state);
+  /// is taken before calling UpdateOperationState.
+  void UpdateOperationState(
+      apache::hive::service::cli::thrift::TOperationState::type operation_state);
 
   /// Gets the query options, their values and levels and populates the result set
   /// with them. It covers the subset of options for 'SET' and all of them for

http://git-wip-us.apache.org/repos/asf/impala/blob/66e5a121/be/src/service/impala-beeswax-server.cc
----------------------------------------------------------------------
diff --git a/be/src/service/impala-beeswax-server.cc b/be/src/service/impala-beeswax-server.cc
index 2ac80b2..b827ff3 100644
--- a/be/src/service/impala-beeswax-server.cc
+++ b/be/src/service/impala-beeswax-server.cc
@@ -65,7 +65,7 @@ void ImpalaServer::query(QueryHandle& query_handle, const Query& query) {
   RAISE_IF_ERROR(Execute(&query_ctx, session, &request_state),
       SQLSTATE_SYNTAX_ERROR_OR_ACCESS_VIOLATION);
 
-  request_state->UpdateNonErrorQueryState(beeswax::QueryState::RUNNING);
+  request_state->UpdateNonErrorOperationState(TOperationState::RUNNING_STATE);
   // start thread to wait for results to become available, which will allow
   // us to advance query state to FINISHED or EXCEPTION
   Status status = request_state->WaitAsync();
@@ -110,7 +110,7 @@ void ImpalaServer::executeAndWait(QueryHandle& query_handle, const Query& query,
   RAISE_IF_ERROR(Execute(&query_ctx, session, &request_state),
       SQLSTATE_SYNTAX_ERROR_OR_ACCESS_VIOLATION);
 
-  request_state->UpdateNonErrorQueryState(beeswax::QueryState::RUNNING);
+  request_state->UpdateNonErrorOperationState(TOperationState::RUNNING_STATE);
   // Once the query is running do a final check for session closure and add it to the
   // set of in-flight queries.
   Status status = SetQueryInflight(session, request_state);
@@ -255,9 +255,10 @@ beeswax::QueryState::type ImpalaServer::get_state(const QueryHandle& handle) {
   // Take the lock to ensure that if the client sees a query_state == EXCEPTION, it is
   // guaranteed to see the error query_status.
   lock_guard<mutex> l(*request_state->lock());
-  DCHECK_EQ(request_state->query_state() == beeswax::QueryState::EXCEPTION,
+  beeswax::QueryState::type query_state = request_state->BeeswaxQueryState();
+  DCHECK_EQ(query_state == beeswax::QueryState::EXCEPTION,
       !request_state->query_status().ok());
-  return request_state->query_state();
+  return query_state;
 }
 
 void ImpalaServer::echo(string& echo_string, const string& input_string) {
@@ -293,7 +294,7 @@ void ImpalaServer::get_log(string& log, const LogContextId& context) {
     // Take the lock to ensure that if the client sees a query_state == EXCEPTION, it is
     // guaranteed to see the error query_status.
     lock_guard<mutex> l(*request_state->lock());
-    DCHECK_EQ(request_state->query_state() == beeswax::QueryState::EXCEPTION,
+    DCHECK_EQ(request_state->BeeswaxQueryState() == beeswax::QueryState::EXCEPTION,
         !request_state->query_status().ok());
     // If the query status is !ok, include the status error message at the top of the log.
     if (!request_state->query_status().ok()) {

http://git-wip-us.apache.org/repos/asf/impala/blob/66e5a121/be/src/service/impala-hs2-server.cc
----------------------------------------------------------------------
diff --git a/be/src/service/impala-hs2-server.cc b/be/src/service/impala-hs2-server.cc
index e495dec..4381f81 100644
--- a/be/src/service/impala-hs2-server.cc
+++ b/be/src/service/impala-hs2-server.cc
@@ -86,18 +86,6 @@ namespace impala {
 
 const string IMPALA_RESULT_CACHING_OPT = "impala.resultset.cache.size";
 
-// Helper function to translate between Beeswax and HiveServer2 type
-static TOperationState::type QueryStateToTOperationState(
-    const beeswax::QueryState::type& query_state) {
-  switch (query_state) {
-    case beeswax::QueryState::CREATED: return TOperationState::INITIALIZED_STATE;
-    case beeswax::QueryState::RUNNING: return TOperationState::RUNNING_STATE;
-    case beeswax::QueryState::FINISHED: return TOperationState::FINISHED_STATE;
-    case beeswax::QueryState::EXCEPTION: return TOperationState::ERROR_STATE;
-    default: return TOperationState::UKNOWN_STATE;
-  }
-}
-
 void ImpalaServer::ExecuteMetadataOp(const THandleIdentifier& session_handle,
     TMetadataOpRequest* request, TOperationHandle* handle, thrift::TStatus* status) {
   TUniqueId session_id;
@@ -161,7 +149,7 @@ void ImpalaServer::ExecuteMetadataOp(const THandleIdentifier& session_handle,
     return;
   }
 
-  request_state->UpdateNonErrorQueryState(beeswax::QueryState::FINISHED);
+  request_state->UpdateNonErrorOperationState(TOperationState::FINISHED_STATE);
 
   Status inflight_status = SetQueryInflight(session, request_state);
   if (!inflight_status.ok()) {
@@ -466,7 +454,7 @@ void ImpalaServer::ExecuteStatement(TExecuteStatementResp& return_val,
         cache_num_rows);
     if (!status.ok()) goto return_error;
   }
-  request_state->UpdateNonErrorQueryState(beeswax::QueryState::RUNNING);
+  request_state->UpdateNonErrorOperationState(TOperationState::RUNNING_STATE);
   // Start thread to wait for results to become available.
   status = request_state->WaitAsync();
   if (!status.ok()) goto return_error;
@@ -651,8 +639,7 @@ void ImpalaServer::GetOperationStatus(TGetOperationStatusResp& return_val,
 
   {
     lock_guard<mutex> l(*request_state->lock());
-    TOperationState::type operation_state = QueryStateToTOperationState(
-        request_state->query_state());
+    TOperationState::type operation_state = request_state->operation_state();
     return_val.__set_operationState(operation_state);
     if (operation_state == TOperationState::ERROR_STATE) {
       DCHECK(!request_state->query_status().ok());

http://git-wip-us.apache.org/repos/asf/impala/blob/66e5a121/be/src/service/impala-http-handler.cc
----------------------------------------------------------------------
diff --git a/be/src/service/impala-http-handler.cc b/be/src/service/impala-http-handler.cc
index 0156023..43b03d1 100644
--- a/be/src/service/impala-http-handler.cc
+++ b/be/src/service/impala-http-handler.cc
@@ -42,6 +42,7 @@
 
 #include "common/names.h"
 
+using namespace apache::hive::service::cli::thrift;
 using namespace apache::thrift;
 using namespace beeswax;
 using namespace impala;
@@ -757,7 +758,7 @@ void ImpalaHttpHandler::QuerySummaryHandler(bool include_json_plan, bool include
       // state lock to be acquired, since it could potentially be an expensive
       // call, if the table Catalog metadata loading is in progress. Instead
       // update the caller that the plan information is unavailable.
-      if (request_state->query_state() == beeswax::QueryState::CREATED) {
+      if (request_state->operation_state() == TOperationState::INITIALIZED_STATE) {
         document->AddMember(
             "plan_metadata_unavailable", "true", document->GetAllocator());
         return;

http://git-wip-us.apache.org/repos/asf/impala/blob/66e5a121/be/src/service/impala-server.cc
----------------------------------------------------------------------
diff --git a/be/src/service/impala-server.cc b/be/src/service/impala-server.cc
index 66b8d0e..6c62a19 100644
--- a/be/src/service/impala-server.cc
+++ b/be/src/service/impala-server.cc
@@ -98,10 +98,11 @@ using boost::get_system_time;
 using boost::system_time;
 using boost::uuids::random_generator;
 using boost::uuids::uuid;
+using namespace apache::hive::service::cli::thrift;
 using namespace apache::thrift;
 using namespace apache::thrift::transport;
-using namespace boost::posix_time;
 using namespace beeswax;
+using namespace boost::posix_time;
 using namespace rapidjson;
 using namespace strings;
 
@@ -610,8 +611,8 @@ Status ImpalaServer::GetRuntimeProfileStr(const TUniqueId& query_id,
   {
     shared_ptr<ClientRequestState> request_state = GetClientRequestState(query_id);
     if (request_state.get() != nullptr) {
-      // For queries in CREATED state, the profile information isn't populated yet.
-      if (request_state->query_state() == beeswax::QueryState::CREATED) {
+      // For queries in INITIALIZED_STATE, the profile information isn't populated yet.
+      if (request_state->operation_state() == TOperationState::INITIALIZED_STATE) {
         return Status::Expected("Query plan is not ready.");
       }
       lock_guard<mutex> l(*request_state->lock());
@@ -1684,7 +1685,7 @@ ImpalaServer::QueryStateRecord::QueryStateRecord(const ClientRequestState& reque
     total_fragments = coord->progress().total();
     has_coord = true;
   }
-  query_state = request_state.query_state();
+  query_state = request_state.BeeswaxQueryState();
   num_rows_fetched = request_state.num_rows_fetched();
   query_status = request_state.query_status();
 


[2/2] impala git commit: IMPALA-6731: Use private index in bootstrap_virtualenv

Posted by ta...@apache.org.
IMPALA-6731: Use private index in bootstrap_virtualenv

This change switches to using a private pypi index url when using a
private pypi mirror. This allows to run the tests without relying on the
public Python pypi mirrors.

Some packages can not detect their dependencies correctly when they get
installed together with the dependencies in the same call to pip. This
change adds a second stage of package installation to separate these
packages from their dependencies.

It also adds a few missing packages and updates some packages to newer
versions.

Testing: Ran this on a box where I blocked DNS resolution to Python's
upstream pypi.

Change-Id: I85f75f1f1a305f3043e0910ab88a880eeb30f00b
Reviewed-on: http://gerrit.cloudera.org:8080/9798
Reviewed-by: Philip Zeyliger <ph...@cloudera.com>
Tested-by: Lars Volker <lv...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/37565e38
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/37565e38
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/37565e38

Branch: refs/heads/master
Commit: 37565e38128173798223b363593228a915bd7beb
Parents: 66e5a12
Author: Lars Volker <lv...@cloudera.com>
Authored: Fri Mar 23 21:50:34 2018 -0700
Committer: Lars Volker <lv...@cloudera.com>
Committed: Mon Mar 26 22:45:03 2018 +0000

----------------------------------------------------------------------
 infra/python/bootstrap_virtualenv.py        | 33 ++++++++++++++++++++----
 infra/python/deps/compiled-requirements.txt |  5 ++--
 infra/python/deps/pip_download.py           |  5 ++--
 infra/python/deps/requirements.txt          | 11 ++++----
 infra/python/deps/stage2-requirements.txt   | 32 +++++++++++++++++++++++
 5 files changed, 71 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/37565e38/infra/python/bootstrap_virtualenv.py
----------------------------------------------------------------------
diff --git a/infra/python/bootstrap_virtualenv.py b/infra/python/bootstrap_virtualenv.py
index e26aaf3..fe7695d 100644
--- a/infra/python/bootstrap_virtualenv.py
+++ b/infra/python/bootstrap_virtualenv.py
@@ -21,6 +21,8 @@
 # A multi-step bootstrapping process is required to build and install all of the
 # dependencies:
 # 1. install basic non-C/C++ packages into the virtualenv
+# 1b. install packages that depend on step 1 but cannot be installed together with their
+#     dependencies
 # 2. use the virtualenv Python to bootstrap the toolchain
 # 3. use toolchain gcc to build C/C++ packages
 # 4. build the kudu-python package with toolchain gcc and Cython
@@ -49,9 +51,13 @@ LOG = logging.getLogger(os.path.splitext(os.path.basename(__file__))[0])
 DEPS_DIR = os.path.join(os.path.dirname(__file__), "deps")
 ENV_DIR = os.path.join(os.path.dirname(__file__), "env")
 
-# Generated using "pip install --download <DIR> -r requirements.txt"
+# Requirements file with packages we need for our build and tests.
 REQS_PATH = os.path.join(DEPS_DIR, "requirements.txt")
 
+# Second stage of requirements which cannot be installed together with their dependencies
+# in requirements.txt.
+REQS2_PATH = os.path.join(DEPS_DIR, "stage2-requirements.txt")
+
 # Requirements for the next bootstrapping step that builds compiled requirements
 # with toolchain gcc.
 COMPILED_REQS_PATH = os.path.join(DEPS_DIR, "compiled-requirements.txt")
@@ -140,9 +146,23 @@ def exec_pip_install(args, cc="no-cc-available", env=None):
   #
   # --no-cache-dir is used to prevent caching of compiled artifacts, which may be built
   # with different compilers or settings.
-  exec_cmd([os.path.join(ENV_DIR, "bin", "python"), os.path.join(ENV_DIR, "bin", "pip"),
-    "install", "--no-binary", "--no-index", "--no-cache-dir", "--find-links",
-    "file://%s" % urllib.pathname2url(os.path.abspath(DEPS_DIR))] + args, env=env)
+  cmd = [os.path.join(ENV_DIR, "bin", "python"), os.path.join(ENV_DIR, "bin", "pip"),
+    "install", "-v", "--no-binary", "--no-cache-dir"]
+
+  # When using a custom mirror, we also must use the index of that mirror.
+  if "PYPI_MIRROR" in os.environ:
+    cmd.extend(["--index-url", "%s/simple" % os.environ["PYPI_MIRROR"]])
+  else:
+    # Prevent fetching additional packages from the index. If we forget to add a package
+    # to one of the requirements.txt files, this should trigger an error. However, we will
+    # still access the index for version/dependency resolution, hence we need to change it
+    # when using a private mirror.
+    cmd.append("--no-index")
+
+  cmd.extend(["--find-links",
+      "file://%s" % urllib.pathname2url(os.path.abspath(DEPS_DIR))])
+  cmd.extend(args)
+  exec_cmd(cmd, env=env)
 
 
 def find_file(*paths):
@@ -181,6 +201,9 @@ def install_deps():
   LOG.info("Installing packages into the virtualenv")
   exec_pip_install(["-r", REQS_PATH])
   mark_reqs_installed(REQS_PATH)
+  LOG.info("Installing stage 2 packages into the virtualenv")
+  exec_pip_install(["-r", REQS2_PATH])
+  mark_reqs_installed(REQS2_PATH)
 
 def have_toolchain():
   '''Return true if the Impala toolchain is available'''
@@ -335,7 +358,7 @@ def reqs_are_installed(reqs_path):
     installed_reqs_file.close()
 
 def setup_virtualenv_if_not_exists():
-  if not reqs_are_installed(REQS_PATH):
+  if not (reqs_are_installed(REQS_PATH) and reqs_are_installed(REQS2_PATH)):
     delete_virtualenv_if_exist()
     create_virtualenv()
     install_deps()

http://git-wip-us.apache.org/repos/asf/impala/blob/37565e38/infra/python/deps/compiled-requirements.txt
----------------------------------------------------------------------
diff --git a/infra/python/deps/compiled-requirements.txt b/infra/python/deps/compiled-requirements.txt
index b3f9f4d..289a78e 100644
--- a/infra/python/deps/compiled-requirements.txt
+++ b/infra/python/deps/compiled-requirements.txt
@@ -26,12 +26,13 @@ Fabric == 1.10.2
 impyla == 0.14.0
   bitarray == 0.8.1
   sasl == 0.1.3
-  six == 1.9.0
+  six == 1.11.0
   # Thrift usually comes from the thirdparty dir but in case the virtualenv is needed
   # before thirdparty is built thrift will be installed anyways.
   thrift == 0.9.0
-  thrift_sasl == 0.1.0
+  thrift-sasl == 0.1.0
 psutil == 0.7.1
 # Required for Kudu:
   Cython == 0.23.4
   numpy == 1.10.4
+  pytz == 2018.3

http://git-wip-us.apache.org/repos/asf/impala/blob/37565e38/infra/python/deps/pip_download.py
----------------------------------------------------------------------
diff --git a/infra/python/deps/pip_download.py b/infra/python/deps/pip_download.py
index 2e84426..6fbb683 100755
--- a/infra/python/deps/pip_download.py
+++ b/infra/python/deps/pip_download.py
@@ -35,8 +35,9 @@ NUM_DOWNLOAD_ATTEMPTS = 8
 PYPI_MIRROR = os.environ.get('PYPI_MIRROR', 'https://pypi.python.org')
 
 # The requirement files that list all of the required packages and versions.
-REQUIREMENTS_FILES = ['requirements.txt', 'compiled-requirements.txt',
-                      'kudu-requirements.txt', 'adls-requirements.txt']
+REQUIREMENTS_FILES = ['requirements.txt', 'stage2-requirements.txt',
+                      'compiled-requirements.txt', 'kudu-requirements.txt',
+                      'adls-requirements.txt']
 
 
 def check_digest(filename, algorithm, expected_digest):

http://git-wip-us.apache.org/repos/asf/impala/blob/37565e38/infra/python/deps/requirements.txt
----------------------------------------------------------------------
diff --git a/infra/python/deps/requirements.txt b/infra/python/deps/requirements.txt
index abf5d7d..bea16f4 100644
--- a/infra/python/deps/requirements.txt
+++ b/infra/python/deps/requirements.txt
@@ -27,6 +27,7 @@ boto3 == 1.2.3
   simplejson == 3.3.0 # For python version 2.6
   botocore == 1.3.30
   python_dateutil == 2.5.2
+    six == 1.11.0
   docutils == 0.12
   jmespath == 0.9.0
   futures == 3.0.5
@@ -47,13 +48,11 @@ pexpect == 3.3
 pg8000 == 1.10.2
 prettytable == 0.7.2
 pyparsing == 2.0.3
-pytest == 2.9.2
-  py == 1.4.32
-pytest-random == 0.02
-pytest-xdist == 1.15.0
 python-magic == 0.4.11
-pywebhdfs == 0.3.2
-  pbr == 1.8.1
+# pbr is required for pywebhdfs but must be installed in a separate call to pip before
+# attempting to install pywebhdfs (https://github.com/pywebhdfs/pywebhdfs/issues/52).
+# pywebhdfs itself will be installed in stage 2.
+  pbr == 3.1.1
 requests == 2.7.0
 # Newer versions of setuptools don't support Python 2.6
 setuptools == 36.8.0

http://git-wip-us.apache.org/repos/asf/impala/blob/37565e38/infra/python/deps/stage2-requirements.txt
----------------------------------------------------------------------
diff --git a/infra/python/deps/stage2-requirements.txt b/infra/python/deps/stage2-requirements.txt
new file mode 100644
index 0000000..eda2cd3
--- /dev/null
+++ b/infra/python/deps/stage2-requirements.txt
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file contains packages that have dependencies in requirements.txt and that have to
+# be installed in a separate invocation of pip.
+
+# Remember, all modules below need to support python 2.6.
+
+# Requires setuptools-scm
+pytest == 2.9.2
+  py == 1.4.32
+  pytest-forked == 0.2
+  pytest-random == 0.02
+  pytest-runner == 4.2
+  pytest-xdist == 1.17.1
+
+# Requires pbr
+pywebhdfs == 0.3.2