Posted to commits@impala.apache.org by jo...@apache.org on 2018/10/17 21:32:17 UTC

[1/5] impala git commit: IMPALA-7715: [DOCS] Better descriptions for conditional functions

Repository: impala
Updated Branches:
  refs/heads/master 5e92d139b -> 6399a65a0


IMPALA-7715: [DOCS] Better descriptions for conditional functions

- Updated the descriptions for ISTRUE, ISFALSE, NONNULLVALUE, and NULLVALUE.
- Updated several function names to use uppercase.
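
For illustration, a minimal SQL sketch (not part of this commit) of the
semantics the updated descriptions document; the operator forms assume
Impala 2.11 or higher:

  -- ISTRUE() returns TRUE only for TRUE; FALSE for FALSE or NULL.
  -- ISFALSE() returns TRUE only for FALSE; FALSE for TRUE or NULL.
  SELECT istrue(NULL), isfalse(NULL);           -- both return false
  -- The IS [NOT] TRUE / IS [NOT] FALSE operators are equivalent to the
  -- built-in functions ISTRUE(), ISNOTTRUE(), ISFALSE(), ISNOTFALSE().
  SELECT NULL IS NOT TRUE, NULL IS NOT FALSE;   -- both return true
  -- NULLVALUE() and NONNULLVALUE() mirror IS NULL and IS NOT NULL.
  SELECT nullvalue(NULL), nonnullvalue(NULL);   -- true, false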

Change-Id: I5cc90d62645730d2674bcb3af614863aa92b92f6
Reviewed-on: http://gerrit.cloudera.org:8080/11704
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-by: Paul Rogers <pa...@yahoo.com>
Reviewed-by: Alex Rodoni <ar...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/a0f351a6
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/a0f351a6
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/a0f351a6

Branch: refs/heads/master
Commit: a0f351a647b7f48468e6dd3877a16947a746ab9c
Parents: 5e92d13
Author: Alex Rodoni <ar...@cloudera.com>
Authored: Tue Oct 16 15:45:49 2018 -0700
Committer: Alex Rodoni <ar...@cloudera.com>
Committed: Wed Oct 17 18:49:55 2018 +0000

----------------------------------------------------------------------
 docs/shared/impala_common.xml                | 116 ++++++++++------------
 docs/topics/impala_conditional_functions.xml |  98 ++++++++++++------
 2 files changed, 120 insertions(+), 94 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/a0f351a6/docs/shared/impala_common.xml
----------------------------------------------------------------------
diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
index 847bc9a..cb4be6c 100644
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -811,59 +811,49 @@ The above raises an error at parse-time.
         HBase tables.
       </p>
 
-      <p id="boolean_functions_vs_expressions" rev="2.11.0 IMPALA-1767">
-        In <keyword keyref="impala211_full"/> and higher, you can use
-        the operators <codeph>IS [NOT] TRUE</codeph> and
-        <codeph>IS [NOT] FALSE</codeph> as equivalents for the built-in
-        functions <codeph>istrue()</codeph>, <codeph>isnottrue()</codeph>,
-        <codeph>isfalse()</codeph>, and <codeph>isnotfalse()</codeph>.
-      </p>
-
-      <p id="base64_charset">
-        The set of characters that can be generated as output
-        from <codeph>base64encode()</codeph>, or specified in
-        the argument string to <codeph>base64decode()</codeph>,
-        are the ASCII uppercase and lowercase letters (A-Z, a-z),
-        digits (0-9), and the punctuation characters
-        <codeph>+</codeph>, <codeph>/</codeph>, and <codeph>=</codeph>.
-      </p>
-
-      <p id="base64_error_handling">
-        If the argument string to <codeph>base64decode()</codeph> does
-        not represent a valid base64-encoded value, subject to the
-        constraints of the Impala implementation such as the allowed
-        character set, the function returns <codeph>NULL</codeph>.
-      </p>
-
-      <p id="base64_use_cases">
-        The functions <codeph>base64encode()</codeph> and
-        <codeph>base64decode()</codeph> are typically used
-        in combination, to store in an Impala table string data that is
-        problematic to store or transmit. For example, you could use
-        these functions to store string data that uses an encoding
-        other than UTF-8, or to transform the values in contexts that
-        require ASCII values, such as for partition key columns.
-        Keep in mind that base64-encoded values produce different results
-        for string functions such as <codeph>LENGTH()</codeph>,
-        <codeph>MAX()</codeph>, and <codeph>MIN()</codeph> than when
-        those functions are called with the unencoded string values.
-      </p>
-
-      <p id="base64_alignment">
-        All return values produced by <codeph>base64encode()</codeph>
-        are a multiple of 4 bytes in length. All argument values
-        supplied to <codeph>base64decode()</codeph> must also be a
-        multiple of 4 bytes in length. If a base64-encoded value
-        would otherwise have a different length, it can be padded
-        with trailing <codeph>=</codeph> characters to reach a length
-        that is a multiple of 4 bytes.
-      </p>
-
-      <p id="base64_examples">
-        The following examples show how to use <codeph>base64encode()</codeph>
-        and <codeph>base64decode()</codeph> together to store and retrieve
-        string values:
-<codeblock>
+      <p id="boolean_functions_vs_expressions" rev="2.11.0 IMPALA-1767"> In
+          <keyword keyref="impala211_full"/> and higher, you can use the
+        operators <codeph>IS [NOT] TRUE</codeph> and <codeph>IS [NOT]
+          FALSE</codeph> as equivalents for the built-in functions
+          <codeph>ISTRUE()</codeph>, <codeph>ISNOTTRUE()</codeph>,
+          <codeph>ISFALSE()</codeph>, and <codeph>ISNOTFALSE()</codeph>. </p>
+
+      <p id="base64_charset"> The set of characters that can be generated as
+        output from <codeph>BASE64ENCODE()</codeph>, or specified in the
+        argument string to <codeph>BASE64DECODE()</codeph>, are the ASCII
+        uppercase and lowercase letters (A-Z, a-z), digits (0-9), and the
+        punctuation characters <codeph>+</codeph>, <codeph>/</codeph>, and
+          <codeph>=</codeph>. </p>
+
+      <p id="base64_error_handling"> If the argument string to
+          <codeph>BASE64DECODE()</codeph> does not represent a valid
+        base64-encoded value, subject to the constraints of the Impala
+        implementation such as the allowed character set, the function returns
+          <codeph>NULL</codeph>. </p>
+
+      <p id="base64_use_cases"> The functions <codeph>BASE64ENCODE()</codeph>
+        and <codeph>BASE64DECODE()</codeph> are typically used in combination,
+        to store in an Impala table string data that is problematic to store or
+        transmit. For example, you could use these functions to store string
+        data that uses an encoding other than UTF-8, or to transform the values
+        in contexts that require ASCII values, such as for partition key
+        columns. Keep in mind that base64-encoded values produce different
+        results for string functions such as <codeph>LENGTH()</codeph>,
+          <codeph>MAX()</codeph>, and <codeph>MIN()</codeph> than when those
+        functions are called with the unencoded string values. </p>
+
+      <p id="base64_alignment"> All return values produced by
+          <codeph>BASE64ENCODE()</codeph> are a multiple of 4 bytes in length.
+        All argument values supplied to <codeph>BASE64DECODE()</codeph> must
+        also be a multiple of 4 bytes in length. If a base64-encoded value would
+        otherwise have a different length, it can be padded with trailing
+          <codeph>=</codeph> characters to reach a length that is a multiple of
+        4 bytes. </p>
+
+      <p id="base64_examples"> The following examples show how to use
+          <codeph>BASE64ENCODE()</codeph> and <codeph>BASE64DECODE()</codeph>
+        together to store and retrieve string values:
+        <codeblock>
 -- An arbitrary string can be encoded in base 64.
 -- The length of the output is a multiple of 4 bytes,
 -- padded with trailing = characters if necessary.
@@ -884,11 +874,9 @@ select base64decode('aGVsbG8gd29ybGQ=') as decoded;
 | hello world |
 +-------------+
 </codeblock>
-
-      These examples demonstrate incorrect encoded values that
-      produce <codeph>NULL</codeph> return values when decoded:
-
-<codeblock>
+        These examples demonstrate incorrect encoded values that produce
+          <codeph>NULL</codeph> return values when decoded:
+        <codeblock>
 -- The input value to base64decode() must be a multiple of 4 bytes.
 -- In this case, leaving off the trailing = padding character
 -- produces a NULL return value.
@@ -911,13 +899,11 @@ select base64decode('abc$');
 +----------------------+
 WARNINGS: UDF WARNING: Could not base64 decode input in space 4; actual output length 0
 </codeblock>
-
-      These examples demonstrate <q>round-tripping</q> of an original string to an
-      encoded string, and back again. This technique is applicable if the original
-      source is in an unknown encoding, or if some intermediate processing stage
-      might cause national characters to be misrepresented:
-
-<codeblock>
+        These examples demonstrate <q>round-tripping</q> of an original string
+        to an encoded string, and back again. This technique is applicable if
+        the original source is in an unknown encoding, or if some intermediate
+        processing stage might cause national characters to be misrepresented:
+        <codeblock>
 select 'circumflex accents: â, ê, î, ô, û' as original,
   base64encode('circumflex accents: â, ê, î, ô, û') as encoded;
 +-----------------------------------+------------------------------------------------------+

http://git-wip-us.apache.org/repos/asf/impala/blob/a0f351a6/docs/topics/impala_conditional_functions.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_conditional_functions.xml b/docs/topics/impala_conditional_functions.xml
index 106c518..ddb824e 100644
--- a/docs/topics/impala_conditional_functions.xml
+++ b/docs/topics/impala_conditional_functions.xml
@@ -171,8 +171,9 @@ under the License.
         </dt>
 
         <dd>
-          <b>Purpose:</b> Tests whether any of a sequence of expressions is true, and returns a
-          corresponding result for the first true expression.
+          <b>Purpose:</b> Tests whether any of a sequence of expressions is
+          <codeph>TRUE</codeph>, and returns a corresponding result for the first true
+          expression.
           <p
             conref="../shared/impala_common.xml#common/return_same_type"/>
 
@@ -308,7 +309,8 @@ under the License.
 
         <dd>
           <b>Purpose:</b> Tests an expression and returns a corresponding result depending on
-          whether the result is true, false, or <codeph>NULL</codeph>.
+          whether the result is <codeph>TRUE</codeph>, <codeph>FALSE</codeph>, or
+          <codeph>NULL</codeph>.
           <p>
             <b>Return type:</b> Same as the <codeph>ifTrue</codeph> argument value
           </p>
@@ -337,12 +339,18 @@ under the License.
         </dt>
 
         <dd>
-          <b>Purpose:</b> Tests if a Boolean expression is <codeph>false</codeph> or not.
-          Returns <codeph>true</codeph> if so. If the argument is <codeph>NULL</codeph>, returns
-          <codeph>false</codeph>. Similar to <codeph>ISNOTTRUE()</codeph>, except it returns the
-          opposite value for a <codeph>NULL</codeph> argument.
-          <p
-            conref="../shared/impala_common.xml#common/return_type_boolean"/>
+          <b>Purpose:</b> Returns <codeph>TRUE</codeph> if the expression is
+          <codeph>FALSE</codeph>. Returns <codeph>FALSE</codeph> if the expression is
+          <codeph>TRUE</codeph> or <codeph>NULL</codeph>.
+          <p>
+            Same as the <codeph>IS FALSE</codeph> operator.
+          </p>
+          <p>
+            Similar to <codeph>ISNOTTRUE()</codeph>, except it returns the opposite value for a
+            <codeph>NULL</codeph> argument.
+          </p>
+
+          <p conref="../shared/impala_common.xml#common/return_type_boolean"/>
 
           <p conref="../shared/impala_common.xml#common/added_in_220"/>
 
@@ -362,11 +370,16 @@ under the License.
         </dt>
 
         <dd>
-          <b>Purpose:</b> Tests if a Boolean expression is not <codeph>false</codeph> (that is,
-          either <codeph>true</codeph> or <codeph>NULL</codeph>). Returns <codeph>true</codeph>
-          if so. If the argument is <codeph>NULL</codeph>, returns <codeph>true</codeph>.
-          Similar to <codeph>ISTRUE()</codeph>, except it returns the opposite value for a
-          <codeph>NULL</codeph> argument.
+          <b>Purpose:</b> Tests if a Boolean expression is not <codeph>FALSE</codeph> (that is,
+          either <codeph>TRUE</codeph> or <codeph>NULL</codeph>). Returns <codeph>TRUE</codeph>
+          if so. If the argument is <codeph>NULL</codeph>, returns <codeph>TRUE</codeph>.
+          <p>
+            Same as the <codeph>IS NOT FALSE</codeph> operator.
+          </p>
+          <p>
+            Similar to <codeph>ISTRUE()</codeph>, except it returns the opposite value for a
+            <codeph>NULL</codeph> argument.
+          </p>
           <p
             conref="../shared/impala_common.xml#common/return_type_boolean"/>
 
@@ -390,11 +403,16 @@ under the License.
         </dt>
 
         <dd>
-          <b>Purpose:</b> Tests if a Boolean expression is not <codeph>true</codeph> (that is,
-          either <codeph>false</codeph> or <codeph>NULL</codeph>). Returns <codeph>true</codeph>
-          if so. If the argument is <codeph>NULL</codeph>, returns <codeph>true</codeph>.
-          Similar to <codeph>ISFALSE()</codeph>, except it returns the opposite value for a
-          <codeph>NULL</codeph> argument.
+          <b>Purpose:</b> Tests if a Boolean expression is not <codeph>TRUE</codeph> (that is,
+          either <codeph>FALSE</codeph> or <codeph>NULL</codeph>). Returns <codeph>TRUE</codeph>
+          if so. If the argument is <codeph>NULL</codeph>, returns <codeph>TRUE</codeph>.
+          <p>
+            Same as the <codeph>IS NOT TRUE</codeph> operator.
+          </p>
+          <p>
+            Similar to <codeph>ISFALSE()</codeph>, except it returns the opposite value for a
+            <codeph>NULL</codeph> argument.
+          </p>
           <p
             conref="../shared/impala_common.xml#common/return_type_boolean"/>
 
@@ -439,12 +457,18 @@ under the License.
         </dt>
 
         <dd>
-          <b>Purpose:</b> Tests if a Boolean expression is <codeph>true</codeph> or not. Returns
-          <codeph>true</codeph> if so. If the argument is <codeph>NULL</codeph>, returns
-          <codeph>false</codeph>. Similar to <codeph>ISNOTFALSE()</codeph>, except it returns
-          the opposite value for a <codeph>NULL</codeph> argument.
-          <p
-            conref="../shared/impala_common.xml#common/return_type_boolean"/>
+          <b>Purpose:</b> Returns <codeph>TRUE</codeph> if the expression is
+          <codeph>TRUE</codeph>. Returns <codeph>FALSE</codeph> if the expression is
+          <codeph>FALSE</codeph> or <codeph>NULL</codeph>.
+          <p>
+            Same as the <codeph>IS TRUE</codeph> operator.
+          </p>
+          <p>
+            Similar to <codeph>ISNOTFALSE()</codeph>, except it returns the opposite value for a
+            <codeph>NULL</codeph> argument.
+          </p>
+
+          <p conref="../shared/impala_common.xml#common/return_type_boolean"/>
 
           <p conref="../shared/impala_common.xml#common/for_compatibility_only"/>
 
@@ -466,8 +490,16 @@ under the License.
         </dt>
 
         <dd>
-          <b>Purpose:</b> Tests if an expression (of any type) is <codeph>NULL</codeph> or not.
-          Returns <codeph>false</codeph> if so. The converse of <codeph>NULLVALUE()</codeph>.
+          <b>Purpose:</b> Returns <codeph>TRUE</codeph> if the expression is non-null and
+          returns <codeph>FALSE</codeph> if the expression is <codeph>NULL</codeph>.
+          <p>
+            Same as the <codeph>IS NOT NULL</codeph> operator.
+          </p>
+
+          <p>
+            The converse of <codeph>NULLVALUE()</codeph>.
+          </p>
+
           <p
             conref="../shared/impala_common.xml#common/return_type_boolean"/>
 
@@ -549,8 +581,16 @@ END</codeblock>
         </dt>
 
         <dd>
-          <b>Purpose:</b> Tests if an expression (of any type) is <codeph>NULL</codeph> or not.
-          Returns <codeph>true</codeph> if so. The converse of <codeph>NONNULLVALUE()</codeph>.
+          <b>Purpose:</b> Returns <codeph>TRUE</codeph> if the expression is
+          <codeph>NULL</codeph>, and returns <codeph>FALSE</codeph> otherwise.
+          <p>
+            Same as the <codeph>IS NULL</codeph> operator.
+          </p>
+
+          <p>
+            The converse of <codeph>NONNULLVALUE()</codeph>.
+          </p>
+
           <p
             conref="../shared/impala_common.xml#common/return_type_boolean"/>
 


[4/5] impala git commit: IMPALA-7644: Hide Parquet page index writing with feature flag

Posted by jo...@apache.org.
IMPALA-7644: Hide Parquet page index writing with feature flag

This commit adds the command-line flag
enable_parquet_page_index_writing_debug_only to the Impala daemon, which
controls whether Impala writes the Parquet page index. By default the
flag is false, i.e. Impala does not write the page index.

This flag is only temporary; we plan to remove it once Impala is able to
read the page index and has better test coverage around it.

Because of this change I had to move test_parquet_page_index.py to the
custom_cluster test suite, since I needed to set this command-line flag
in order to test the functionality. I also merged most of the test cases
because we don't want to restart the cluster too many times.

I removed 'num_data_pages_' from BaseColumnWriter since it was rather
confusing and didn't provide any measurable performance improvement.

This commit fixes the ASAN error produced by the first IMPALA-7644
commit, which was later reverted.

Change-Id: Ib4a9098a2085a385351477c715ae245d83bf1c72
Reviewed-on: http://gerrit.cloudera.org:8080/11694
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/de7f09d7
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/de7f09d7
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/de7f09d7

Branch: refs/heads/master
Commit: de7f09d726240a32739d59fe16faec5792e7c7a3
Parents: 275124e
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
Authored: Tue Oct 2 14:11:58 2018 +0200
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Wed Oct 17 19:57:17 2018 +0000

----------------------------------------------------------------------
 be/src/common/global-flags.cc                   |   6 +
 be/src/exec/hdfs-parquet-table-writer.cc        | 144 +++----
 .../queries/QueryTest/stats-extrapolation.test  |  14 +-
 tests/custom_cluster/test_parquet_page_index.py | 371 ++++++++++++++++++
 tests/query_test/test_parquet_page_index.py     | 372 -------------------
 5 files changed, 463 insertions(+), 444 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/de7f09d7/be/src/common/global-flags.cc
----------------------------------------------------------------------
diff --git a/be/src/common/global-flags.cc b/be/src/common/global-flags.cc
index d6f1d04..0473352 100644
--- a/be/src/common/global-flags.cc
+++ b/be/src/common/global-flags.cc
@@ -239,6 +239,12 @@ DEFINE_double_hidden(invalidate_tables_fraction_on_memory_pressure, 0.1,
     "The fraction of tables to invalidate when CatalogdTableInvalidator considers the "
     "old GC generation to be almost full.");
 
+DEFINE_bool_hidden(enable_parquet_page_index_writing_debug_only, false, "If true, Impala "
+    "will write the Parquet page index. It is not advised to use it in a production "
+    "environment, only for testing and development. This flag is meant to be temporary. "
+    "We plan to remove this flag once Impala is able to read the page index and has "
+    "better test coverage around it.");
+
 // ++========================++
 // || Startup flag graveyard ||
 // ++========================++

http://git-wip-us.apache.org/repos/asf/impala/blob/de7f09d7/be/src/exec/hdfs-parquet-table-writer.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-table-writer.cc b/be/src/exec/hdfs-parquet-table-writer.cc
index 8aa4f7a..8a8add6 100644
--- a/be/src/exec/hdfs-parquet-table-writer.cc
+++ b/be/src/exec/hdfs-parquet-table-writer.cc
@@ -83,6 +83,8 @@ using namespace apache::thrift;
 // the columns and run that function over row batches.
 // TODO: we need to pass in the compression from the FE/metadata
 
+DECLARE_bool(enable_parquet_page_index_writing_debug_only);
+
 namespace impala {
 
 // Base class for column writers. This contains most of the logic except for
@@ -163,12 +165,12 @@ class HdfsParquetTableWriter::BaseColumnWriter {
   // Any data for previous row groups must be reset (e.g. dictionaries).
   // Subclasses must call this if they override this function.
   virtual void Reset() {
-    num_data_pages_ = 0;
-    current_page_ = nullptr;
     num_values_ = 0;
     total_compressed_byte_size_ = 0;
     current_encoding_ = parquet::Encoding::PLAIN;
     next_page_encoding_ = parquet::Encoding::PLAIN;
+    pages_.clear();
+    current_page_ = nullptr;
     column_encodings_.clear();
     dict_encoding_stats_.clear();
     data_encoding_stats_.clear();
@@ -205,6 +207,58 @@ class HdfsParquetTableWriter::BaseColumnWriter {
  protected:
   friend class HdfsParquetTableWriter;
 
+  Status AddMemoryConsumptionForPageIndex(int64_t new_memory_allocation) {
+    if (UNLIKELY(!table_sink_mem_tracker_->TryConsume(new_memory_allocation))) {
+      return table_sink_mem_tracker_->MemLimitExceeded(parent_->state_,
+          "Failed to allocate memory for Parquet page index.", new_memory_allocation);
+    }
+    page_index_memory_consumption_ += new_memory_allocation;
+    return Status::OK();
+  }
+
+  Status ReserveOffsetIndex(int64_t capacity) {
+    if (!FLAGS_enable_parquet_page_index_writing_debug_only) return Status::OK();
+    RETURN_IF_ERROR(
+        AddMemoryConsumptionForPageIndex(capacity * sizeof(parquet::PageLocation)));
+    offset_index_.page_locations.reserve(capacity);
+    return Status::OK();
+  }
+
+  void AddLocationToOffsetIndex(const parquet::PageLocation& location) {
+    if (!FLAGS_enable_parquet_page_index_writing_debug_only) return;
+    offset_index_.page_locations.push_back(location);
+  }
+
+  Status AddPageStatsToColumnIndex() {
+    if (!FLAGS_enable_parquet_page_index_writing_debug_only) return Status::OK();
+    parquet::Statistics page_stats;
+    page_stats_base_->EncodeToThrift(&page_stats);
+    // If pages_stats contains min_value and max_value, then append them to min_values_
+    // and max_values_ and also mark the page as not null. In case min and max values are
+    // not set, push empty strings to maintain the consistency of the index and mark the
+    // page as null. Always push the null_count.
+    string min_val;
+    string max_val;
+    if ((page_stats.__isset.min_value) && (page_stats.__isset.max_value)) {
+      Status s_min = TruncateDown(page_stats.min_value, PAGE_INDEX_MAX_STRING_LENGTH,
+          &min_val);
+      Status s_max = TruncateUp(page_stats.max_value, PAGE_INDEX_MAX_STRING_LENGTH,
+          &max_val);
+      if (!s_min.ok() || !s_max.ok()) valid_column_index_ = false;
+      column_index_.null_pages.push_back(false);
+    } else {
+      DCHECK(!page_stats.__isset.min_value && !page_stats.__isset.max_value);
+      column_index_.null_pages.push_back(true);
+      DCHECK_EQ(page_stats.null_count, num_values_);
+    }
+    RETURN_IF_ERROR(
+        AddMemoryConsumptionForPageIndex(min_val.capacity() + max_val.capacity()));
+    column_index_.min_values.emplace_back(std::move(min_val));
+    column_index_.max_values.emplace_back(std::move(max_val));
+    column_index_.null_counts.push_back(page_stats.null_count);
+    return Status::OK();
+  }
+
   // Encodes value into the current page output buffer and updates the column statistics
   // aggregates. Returns true if the value was appended successfully to the current page.
   // Returns false if the value was not appended to the current page and the caller can
@@ -254,12 +308,6 @@ class HdfsParquetTableWriter::BaseColumnWriter {
   // compressed.
   scoped_ptr<Codec> compressor_;
 
-  vector<DataPage> pages_;
-
-  // Number of pages in 'pages_' that are used.  'pages_' is reused between flushes
-  // so this number can be less than pages_.size()
-  int num_data_pages_;
-
   // Size of newly created pages. Defaults to DEFAULT_DATA_PAGE_SIZE and is increased
   // when pages are not big enough. This only happens when there are enough unique values
   // such that we switch from PLAIN_DICTIONARY to PLAIN encoding and then have very
@@ -267,6 +315,10 @@ class HdfsParquetTableWriter::BaseColumnWriter {
   // TODO: Consider removing and only creating a single large page as necessary.
   int64_t page_size_;
 
+  // Pages belong to this column chunk. We need to keep them in memory in order to write
+  // them together.
+  vector<DataPage> pages_;
+
   // Pointer to the current page in 'pages_'. Not owned.
   DataPage* current_page_;
 
@@ -645,11 +697,10 @@ Status HdfsParquetTableWriter::BaseColumnWriter::Flush(int64_t* file_pos,
 
   *first_data_page = *file_pos;
   int64_t current_row_group_index = 0;
-  offset_index_.page_locations.resize(num_data_pages_);
+  RETURN_IF_ERROR(ReserveOffsetIndex(pages_.size()));
 
   // Write data pages
-  for (int i = 0; i < num_data_pages_; ++i) {
-    DataPage& page = pages_[i];
+  for (const DataPage& page : pages_) {
     parquet::PageLocation location;
 
     if (page.header.data_page_header.num_values == 0) {
@@ -657,7 +708,7 @@ Status HdfsParquetTableWriter::BaseColumnWriter::Flush(int64_t* file_pos,
       location.offset = -1;
       location.compressed_page_size = 0;
       location.first_row_index = -1;
-      offset_index_.page_locations[i] = location;
+      AddLocationToOffsetIndex(location);
       continue;
     }
 
@@ -677,7 +728,7 @@ Status HdfsParquetTableWriter::BaseColumnWriter::Flush(int64_t* file_pos,
     // its name suggests. On the other hand, parquet::PageLocation::compressed_page_size
     // also includes the size of the page header.
     location.compressed_page_size = page.header.compressed_page_size + len;
-    offset_index_.page_locations[i] = location;
+    AddLocationToOffsetIndex(location);
 
     // Write the page data
     RETURN_IF_ERROR(parent_->Write(page.data, page.header.compressed_page_size));
@@ -754,37 +805,7 @@ Status HdfsParquetTableWriter::BaseColumnWriter::FinalizeCurrentPage() {
   }
 
   DCHECK(page_stats_base_ != nullptr);
-  parquet::Statistics page_stats;
-  page_stats_base_->EncodeToThrift(&page_stats);
-  {
-    // If pages_stats contains min_value and max_value, then append them to min_values_
-    // and max_values_ and also mark the page as not null. In case min and max values are
-    // not set, push empty strings to maintain the consistency of the index and mark the
-    // page as null. Always push the null_count.
-    string min_val;
-    string max_val;
-    if ((page_stats.__isset.min_value) && (page_stats.__isset.max_value)) {
-      Status s_min = TruncateDown(page_stats.min_value, PAGE_INDEX_MAX_STRING_LENGTH,
-          &min_val);
-      Status s_max = TruncateUp(page_stats.max_value, PAGE_INDEX_MAX_STRING_LENGTH,
-          &max_val);
-      if (!s_min.ok() || !s_max.ok()) valid_column_index_ = false;
-      column_index_.null_pages.push_back(false);
-    } else {
-      DCHECK(!page_stats.__isset.min_value && !page_stats.__isset.max_value);
-      column_index_.null_pages.push_back(true);
-      DCHECK_EQ(page_stats.null_count, num_values_);
-    }
-    int64_t new_memory_allocation = min_val.capacity() + max_val.capacity();
-    if (UNLIKELY(!table_sink_mem_tracker_->TryConsume(new_memory_allocation))) {
-      return table_sink_mem_tracker_->MemLimitExceeded(parent_->state_,
-          "Failed to allocate memory for Parquet page index.", new_memory_allocation);
-    }
-    page_index_memory_consumption_ += new_memory_allocation;
-    column_index_.min_values.emplace_back(std::move(min_val));
-    column_index_.max_values.emplace_back(std::move(max_val));
-    column_index_.null_counts.push_back(page_stats.null_count);
-  }
+  RETURN_IF_ERROR(AddPageStatsToColumnIndex());
 
   // Update row group statistics from page statistics.
   DCHECK(row_group_stats_base_ != nullptr);
@@ -805,25 +826,17 @@ Status HdfsParquetTableWriter::BaseColumnWriter::FinalizeCurrentPage() {
 }
 
 void HdfsParquetTableWriter::BaseColumnWriter::NewPage() {
-  if (num_data_pages_ < pages_.size()) {
-    // Reuse an existing page
-    current_page_ = &pages_[num_data_pages_++];
-    current_page_->header.data_page_header.num_values = 0;
-    current_page_->header.compressed_page_size = 0;
-    current_page_->header.uncompressed_page_size = 0;
-  } else {
-    pages_.push_back(DataPage());
-    current_page_ = &pages_[num_data_pages_++];
-
-    parquet::DataPageHeader header;
-    header.num_values = 0;
-    // The code that populates the column chunk metadata's encodings field
-    // relies on these specific values for the definition/repetition level
-    // encodings.
-    header.definition_level_encoding = parquet::Encoding::RLE;
-    header.repetition_level_encoding = parquet::Encoding::RLE;
-    current_page_->header.__set_data_page_header(header);
-  }
+  pages_.push_back(DataPage());
+  current_page_ = &pages_.back();
+
+  parquet::DataPageHeader header;
+  header.num_values = 0;
+  // The code that populates the column chunk metadata's encodings field
+  // relies on these specific values for the definition/repetition level
+  // encodings.
+  header.definition_level_encoding = parquet::Encoding::RLE;
+  header.repetition_level_encoding = parquet::Encoding::RLE;
+  current_page_->header.__set_data_page_header(header);
   current_encoding_ = next_page_encoding_;
   current_page_->finalized = false;
   current_page_->num_non_null = 0;
@@ -1137,6 +1150,7 @@ Status HdfsParquetTableWriter::Finalize() {
 
   RETURN_IF_ERROR(FlushCurrentRowGroup());
   RETURN_IF_ERROR(WritePageIndex());
+  for (auto& column : columns_) column->Reset();
   RETURN_IF_ERROR(WriteFileFooter());
   stats_.__set_parquet_stats(parquet_insert_stats_);
   COUNTER_ADD(parent_->rows_inserted_counter(), row_count_);
@@ -1249,6 +1263,8 @@ Status HdfsParquetTableWriter::FlushCurrentRowGroup() {
 }
 
 Status HdfsParquetTableWriter::WritePageIndex() {
+  if (!FLAGS_enable_parquet_page_index_writing_debug_only) return Status::OK();
+
   // Currently Impala only write Parquet files with a single row group. The current
   // page index logic depends on this behavior as it only keeps one row group's
   // statistics in memory.
@@ -1284,8 +1300,6 @@ Status HdfsParquetTableWriter::WritePageIndex() {
     row_group->columns[i].__set_offset_index_length(len);
     file_pos_ += len;
   }
-  // Reset column writers.
-  for (auto& column : columns_) column->Reset();
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/impala/blob/de7f09d7/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test b/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test
index 8e95168..3b25427 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test
@@ -33,17 +33,17 @@ show table stats alltypes
 YEAR, MONTH, #ROWS, EXTRAP #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS, LOCATION
 ---- RESULTS
 '2009','1',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=1'
-'2009','2',-1,289,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=2'
-'2009','3',-1,307,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=3'
+'2009','2',-1,288,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=2'
+'2009','3',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=3'
 '2009','4',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=4'
-'2009','5',-1,307,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=5'
+'2009','5',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=5'
 '2009','6',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=6'
-'2009','7',-1,307,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=7'
-'2009','8',-1,307,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=8'
+'2009','7',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=7'
+'2009','8',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=8'
 '2009','9',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=9'
-'2009','10',-1,307,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=10'
+'2009','10',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=10'
 '2009','11',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=11'
-'2009','12',-1,307,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=12'
+'2009','12',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=12'
 'Total','',3650,3650,12,regex:.*B,'0B','','','',''
 ---- TYPES
 STRING,STRING,BIGINT,BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING

http://git-wip-us.apache.org/repos/asf/impala/blob/de7f09d7/tests/custom_cluster/test_parquet_page_index.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_parquet_page_index.py b/tests/custom_cluster/test_parquet_page_index.py
new file mode 100644
index 0000000..0d2a750
--- /dev/null
+++ b/tests/custom_cluster/test_parquet_page_index.py
@@ -0,0 +1,371 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Targeted Impala insert tests
+
+import os
+
+from collections import namedtuple
+from subprocess import check_call
+from parquet.ttypes import BoundaryOrder, ColumnIndex, OffsetIndex, PageHeader, PageType
+
+from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
+from tests.common.skip import SkipIfLocal
+from tests.util.filesystem_utils import get_fs_path
+from tests.util.get_parquet_metadata import (
+    decode_stats_value,
+    get_parquet_metadata,
+    read_serialized_object
+)
+
+PAGE_INDEX_MAX_STRING_LENGTH = 64
+
+
+@SkipIfLocal.parquet_file_size
+class TestHdfsParquetTableIndexWriter(CustomClusterTestSuite):
+  """Since PARQUET-922 page statistics can be written before the footer.
+  The tests in this class checks if Impala writes the page indices correctly.
+  It is temporarily a custom cluster test suite because we need to set the
+  enable_parquet_page_index_writing command-line flag for the Impala daemon
+  in order to make it write the page index.
+  TODO: IMPALA-5843 Once Impala is able to read the page index and also write it by
+  default, this test suite should be moved back to query tests.
+  """
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(CustomClusterTestSuite, cls).add_test_dimensions()
+    cls.ImpalaTestMatrix.add_constraint(
+        lambda v: v.get_value('table_format').file_format == 'parquet')
+
+  def _get_row_group_from_file(self, parquet_file):
+    """Returns namedtuples that contain the schema, stats, offset_index, column_index,
+    and page_headers for each column in the first row group in file 'parquet_file'. Fails
+    if the file contains multiple row groups.
+    """
+    ColumnInfo = namedtuple('ColumnInfo', ['schema', 'stats', 'offset_index',
+        'column_index', 'page_headers'])
+
+    file_meta_data = get_parquet_metadata(parquet_file)
+    assert len(file_meta_data.row_groups) == 1
+    # We only support flat schemas, the additional element is the root element.
+    schemas = file_meta_data.schema[1:]
+    row_group = file_meta_data.row_groups[0]
+    assert len(schemas) == len(row_group.columns)
+    row_group_index = []
+    with open(parquet_file) as file_handle:
+      for column, schema in zip(row_group.columns, schemas):
+        column_index_offset = column.column_index_offset
+        column_index_length = column.column_index_length
+        column_index = None
+        if column_index_offset and column_index_length:
+          column_index = read_serialized_object(ColumnIndex, file_handle,
+                                                column_index_offset, column_index_length)
+        column_meta_data = column.meta_data
+        stats = None
+        if column_meta_data:
+          stats = column_meta_data.statistics
+
+        offset_index_offset = column.offset_index_offset
+        offset_index_length = column.offset_index_length
+        offset_index = None
+        page_headers = []
+        if offset_index_offset and offset_index_length:
+          offset_index = read_serialized_object(OffsetIndex, file_handle,
+                                                offset_index_offset, offset_index_length)
+          for page_loc in offset_index.page_locations:
+            page_header = read_serialized_object(PageHeader, file_handle, page_loc.offset,
+                                                 page_loc.compressed_page_size)
+            page_headers.append(page_header)
+
+        column_info = ColumnInfo(schema, stats, offset_index, column_index, page_headers)
+        row_group_index.append(column_info)
+    return row_group_index
+
+  def _get_row_groups_from_hdfs_folder(self, hdfs_path, tmpdir):
+    """Returns a list of column infos (containing the schema, stats, offset_index,
+    column_index, and page_headers) for the first row group in all parquet files in
+    'hdfs_path'.
+    """
+    row_group_indexes = []
+    check_call(['hdfs', 'dfs', '-get', hdfs_path, tmpdir.strpath])
+    for root, subdirs, files in os.walk(tmpdir.strpath):
+      for f in files:
+        parquet_file = os.path.join(root, str(f))
+        row_group_indexes.append(self._get_row_group_from_file(parquet_file))
+    return row_group_indexes
+
+  def _validate_page_locations(self, page_locations):
+    """Validate that the page locations are in order."""
+    for previous_loc, current_loc in zip(page_locations[:-1], page_locations[1:]):
+      assert previous_loc.offset < current_loc.offset
+      assert previous_loc.first_row_index < current_loc.first_row_index
+
+  def _validate_null_stats(self, index_size, column_info):
+    """Validates the statistics stored in null_pages and null_counts."""
+    column_index = column_info.column_index
+    column_stats = column_info.stats
+    assert column_index.null_pages is not None
+    assert len(column_index.null_pages) == index_size
+    assert column_index.null_counts is not None
+    assert len(column_index.null_counts) == index_size
+
+    for page_is_null, null_count, page_header in zip(column_index.null_pages,
+        column_index.null_counts, column_info.page_headers):
+      assert page_header.type == PageType.DATA_PAGE
+      num_values = page_header.data_page_header.num_values
+      assert not page_is_null or null_count == num_values
+
+    if column_stats:
+      assert column_stats.null_count == sum(column_index.null_counts)
+
+  def _validate_min_max_values(self, index_size, column_info):
+    """Validate min/max values of the pages in a column chunk."""
+    column_index = column_info.column_index
+    min_values = column_info.column_index.min_values
+    assert len(min_values) == index_size
+    max_values = column_info.column_index.max_values
+    assert len(max_values) == index_size
+
+    if not column_info.stats:
+      return
+
+    column_min_value_str = column_info.stats.min_value
+    column_max_value_str = column_info.stats.max_value
+    if column_min_value_str is None or column_max_value_str is None:
+      # If either is None, then both need to be None.
+      assert column_min_value_str is None and column_max_value_str is None
+      # No min and max value, all pages need to be null
+      for idx, null_page in enumerate(column_index.null_pages):
+        assert null_page, "Page {} of column {} is not null, \
+            but doesn't have min and max values!".format(idx, column_index.schema.name)
+      # Everything is None, no further checks needed.
+      return
+
+    column_min_value = decode_stats_value(column_info.schema, column_min_value_str)
+    for null_page, page_min_str in zip(column_index.null_pages, min_values):
+      if not null_page:
+        page_min_value = decode_stats_value(column_info.schema, page_min_str)
+        # If type is str, page_min_value might have been truncated.
+        if isinstance(page_min_value, basestring):
+          assert page_min_value >= column_min_value[:len(page_min_value)]
+        else:
+          assert page_min_value >= column_min_value
+
+    column_max_value = decode_stats_value(column_info.schema, column_max_value_str)
+    for null_page, page_max_str in zip(column_index.null_pages, max_values):
+      if not null_page:
+        page_max_value = decode_stats_value(column_info.schema, page_max_str)
+        # If type is str, page_max_value might have been truncated and incremented.
+        if (isinstance(page_max_value, basestring) and
+            len(page_max_value) == PAGE_INDEX_MAX_STRING_LENGTH):
+          max_val_prefix = page_max_value.rstrip('\0')
+          assert max_val_prefix[:-1] <= column_max_value
+        else:
+          assert page_max_value <= column_max_value
+
+  def _validate_ordering(self, ordering, schema, null_pages, min_values, max_values):
+    """Check if the ordering of the values reflects the value of 'ordering'."""
+
+    def is_sorted(l, reverse=False):
+      if not reverse:
+        return all(a <= b for a, b in zip(l, l[1:]))
+      else:
+        return all(a >= b for a, b in zip(l, l[1:]))
+
+    # Filter out null pages and decode the actual min/max values.
+    actual_min_values = [decode_stats_value(schema, min_val)
+                         for min_val, is_null in zip(min_values, null_pages)
+                         if not is_null]
+    actual_max_values = [decode_stats_value(schema, max_val)
+                         for max_val, is_null in zip(max_values, null_pages)
+                         if not is_null]
+
+    # For ASCENDING and DESCENDING, both min and max values need to be sorted.
+    if ordering == BoundaryOrder.ASCENDING:
+      assert is_sorted(actual_min_values)
+      assert is_sorted(actual_max_values)
+    elif ordering == BoundaryOrder.DESCENDING:
+      assert is_sorted(actual_min_values, reverse=True)
+      assert is_sorted(actual_max_values, reverse=True)
+    else:
+      assert ordering == BoundaryOrder.UNORDERED
+      # For UNORDERED, min and max values cannot be both sorted.
+      assert not is_sorted(actual_min_values) or not is_sorted(actual_max_values)
+      assert (not is_sorted(actual_min_values, reverse=True) or
+              not is_sorted(actual_max_values, reverse=True))
+
+  def _validate_boundary_order(self, column_info):
+    """Validate that min/max values are really in the order specified by
+    boundary order.
+    """
+    column_index = column_info.column_index
+    self._validate_ordering(column_index.boundary_order, column_info.schema,
+        column_index.null_pages, column_index.min_values, column_index.max_values)
+
+  def _validate_parquet_page_index(self, hdfs_path, tmpdir):
+    """Validates that 'hdfs_path' contains exactly one parquet file and that the rowgroup
+    index in that file is in the valid format.
+    """
+    row_group_indexes = self._get_row_groups_from_hdfs_folder(hdfs_path, tmpdir)
+    for columns in row_group_indexes:
+      for column_info in columns:
+        try:
+          index_size = len(column_info.offset_index.page_locations)
+          assert index_size > 0
+          self._validate_page_locations(column_info.offset_index.page_locations)
+          # IMPALA-7304: Impala doesn't write column index for floating-point columns
+          # until PARQUET-1222 is resolved.
+          if column_info.schema.type in [4, 5]:
+            assert column_info.column_index is None
+            continue
+          self._validate_null_stats(index_size, column_info)
+          self._validate_min_max_values(index_size, column_info)
+          self._validate_boundary_order(column_info)
+        except AssertionError as e:
+          e.args += ("Validation failed on column {}.".format(column_info.schema.name),)
+          raise
+
+  def _ctas_table_and_verify_index(self, vector, unique_database, source_table,
+                                   tmpdir, sorting_column=None):
+    """Copies 'source_table' into a parquet table and makes sure that the index
+    in the resulting parquet file is valid.
+    """
+    table_name = "test_hdfs_parquet_table_writer"
+    qualified_table_name = "{0}.{1}".format(unique_database, table_name)
+    hdfs_path = get_fs_path('/test-warehouse/{0}.db/{1}/'.format(unique_database,
+                                                                 table_name))
+    # Setting num_nodes = 1 ensures that the query is executed on the coordinator,
+    # resulting in a single parquet file being written.
+    vector.get_value('exec_option')['num_nodes'] = 1
+    self.execute_query("drop table if exists {0}".format(qualified_table_name))
+    if sorting_column is None:
+      query = ("create table {0} stored as parquet as select * from {1}").format(
+          qualified_table_name, source_table)
+    else:
+      query = ("create table {0} sort by({1}) stored as parquet as select * from {2}"
+               ).format(qualified_table_name, sorting_column, source_table)
+    self.execute_query(query, vector.get_value('exec_option'))
+    self._validate_parquet_page_index(hdfs_path, tmpdir.join(source_table))
+
+  def _create_string_table_with_values(self, vector, unique_database, table_name,
+                                       values_sql):
+    """Creates a parquet table that has a single string column, then invokes an insert
+    statement on it with the 'values_sql' parameter. E.g. 'values_sql' is "('asdf')".
+    It returns the HDFS path for the table.
+    """
+    qualified_table_name = "{0}.{1}".format(unique_database, table_name)
+    self.execute_query("drop table if exists {0}".format(qualified_table_name))
+    vector.get_value('exec_option')['num_nodes'] = 1
+    query = ("create table {0} (str string) stored as parquet").format(
+        qualified_table_name)
+    self.execute_query(query, vector.get_value('exec_option'))
+    self.execute_query("insert into {0} values {1}".format(qualified_table_name,
+        values_sql), vector.get_value('exec_option'))
+    return get_fs_path('/test-warehouse/{0}.db/{1}/'.format(unique_database,
+        table_name))
+
+  @CustomClusterTestSuite.with_args("--enable_parquet_page_index_writing_debug_only")
+  def test_ctas_tables(self, vector, unique_database, tmpdir):
+    """Test different Parquet files created via CTAS statements."""
+
+    # Test that writing a parquet file populates the rowgroup indexes with the correct
+    # values.
+    self._ctas_table_and_verify_index(vector, unique_database, "functional.alltypes",
+        tmpdir)
+
+    # Test that writing a parquet file populates the rowgroup indexes with the correct
+    # values, using decimal types.
+    self._ctas_table_and_verify_index(vector, unique_database, "functional.decimal_tbl",
+        tmpdir)
+
+    # Test that writing a parquet file populates the rowgroup indexes with the correct
+    # values, using char types.
+    self._ctas_table_and_verify_index(vector, unique_database, "functional.chars_formats",
+        tmpdir)
+
+    # Test that we don't write min/max values in the index for null columns.
+    # Ensure null_count is set for columns with null values.
+    self._ctas_table_and_verify_index(vector, unique_database, "functional.nulltable",
+        tmpdir)
+
+    # Test that when a ColumnChunk is written across multiple pages, the index is
+    # valid.
+    self._ctas_table_and_verify_index(vector, unique_database, "tpch.customer",
+        tmpdir)
+    self._ctas_table_and_verify_index(vector, unique_database, "tpch.orders",
+        tmpdir)
+
+    # Test that when the schema has a sorting column, the index is valid.
+    self._ctas_table_and_verify_index(vector, unique_database,
+        "functional_parquet.zipcode_incomes", tmpdir, "id")
+
+    # Test table with wide row.
+    self._ctas_table_and_verify_index(vector, unique_database,
+        "functional_parquet.widerow", tmpdir)
+
+    # Test tables with wide rows and many columns.
+    self._ctas_table_and_verify_index(vector, unique_database,
+        "functional_parquet.widetable_250_cols", tmpdir)
+    self._ctas_table_and_verify_index(vector, unique_database,
+        "functional_parquet.widetable_500_cols", tmpdir)
+    self._ctas_table_and_verify_index(vector, unique_database,
+        "functional_parquet.widetable_1000_cols", tmpdir)
+
+  @CustomClusterTestSuite.with_args("--enable_parquet_page_index_writing_debug_only")
+  def test_max_string_values(self, vector, unique_database, tmpdir):
+    """Test string values that are all 0xFFs or end with 0xFFs."""
+
+    # String value is all of 0xFFs but its length is less than PAGE_INDEX_TRUNCATE_LENGTH.
+    short_tbl = "short_tbl"
+    short_hdfs_path = self._create_string_table_with_values(vector, unique_database,
+        short_tbl, "(rpad('', {0}, chr(255)))".format(PAGE_INDEX_MAX_STRING_LENGTH - 1))
+    self._validate_parquet_page_index(short_hdfs_path, tmpdir.join(short_tbl))
+
+    # String value is all of 0xFFs and its length is PAGE_INDEX_TRUNCATE_LENGTH.
+    fit_tbl = "fit_tbl"
+    fit_hdfs_path = self._create_string_table_with_values(vector, unique_database,
+        fit_tbl, "(rpad('', {0}, chr(255)))".format(PAGE_INDEX_MAX_STRING_LENGTH))
+    self._validate_parquet_page_index(fit_hdfs_path, tmpdir.join(fit_tbl))
+
+    # All bytes are 0xFFs and the string is longer than PAGE_INDEX_TRUNCATE_LENGTH, so we
+    # should not write page statistics.
+    too_long_tbl = "too_long_tbl"
+    too_long_hdfs_path = self._create_string_table_with_values(vector, unique_database,
+        too_long_tbl, "(rpad('', {0}, chr(255)))".format(
+            PAGE_INDEX_MAX_STRING_LENGTH + 1))
+    row_group_indexes = self._get_row_groups_from_hdfs_folder(too_long_hdfs_path,
+        tmpdir.join(too_long_tbl))
+    column = row_group_indexes[0][0]
+    assert column.column_index is None
+    # We always write the offset index
+    assert column.offset_index is not None
+
+    # Test a string value that starts with 'aaa' followed by 0xFFs and whose length is
+    # greater than PAGE_INDEX_TRUNCATE_LENGTH. Max value should be 'aab'.
+    aaa_tbl = "aaa_tbl"
+    aaa_hdfs_path = self._create_string_table_with_values(vector, unique_database,
+        aaa_tbl, "(rpad('aaa', {0}, chr(255)))".format(PAGE_INDEX_MAX_STRING_LENGTH + 1))
+    row_group_indexes = self._get_row_groups_from_hdfs_folder(aaa_hdfs_path,
+        tmpdir.join(aaa_tbl))
+    column = row_group_indexes[0][0]
+    assert len(column.column_index.max_values) == 1
+    max_value = column.column_index.max_values[0]
+    assert max_value == 'aab'

http://git-wip-us.apache.org/repos/asf/impala/blob/de7f09d7/tests/query_test/test_parquet_page_index.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_parquet_page_index.py b/tests/query_test/test_parquet_page_index.py
deleted file mode 100644
index 6235819..0000000
--- a/tests/query_test/test_parquet_page_index.py
+++ /dev/null
@@ -1,372 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Targeted Impala insert tests
-
-import os
-
-from collections import namedtuple
-from subprocess import check_call
-from parquet.ttypes import BoundaryOrder, ColumnIndex, OffsetIndex, PageHeader, PageType
-
-from tests.common.impala_test_suite import ImpalaTestSuite
-from tests.common.skip import SkipIfLocal
-from tests.util.filesystem_utils import get_fs_path
-from tests.util.get_parquet_metadata import (
-    decode_stats_value,
-    get_parquet_metadata,
-    read_serialized_object
-)
-
-PAGE_INDEX_MAX_STRING_LENGTH = 64
-
-
-@SkipIfLocal.parquet_file_size
-class TestHdfsParquetTableIndexWriter(ImpalaTestSuite):
-  """Since PARQUET-922 page statistics can be written before the footer.
-  The tests in this class checks if Impala writes the page indices correctly.
-  """
-  @classmethod
-  def get_workload(cls):
-    return 'functional-query'
-
-  @classmethod
-  def add_test_dimensions(cls):
-    super(TestHdfsParquetTableIndexWriter, cls).add_test_dimensions()
-    cls.ImpalaTestMatrix.add_constraint(
-        lambda v: v.get_value('table_format').file_format == 'parquet')
-
-  def _get_row_group_from_file(self, parquet_file):
-    """Returns namedtuples that contain the schema, stats, offset_index, column_index,
-    and page_headers for each column in the first row group in file 'parquet_file'. Fails
-    if the file contains multiple row groups.
-    """
-    ColumnInfo = namedtuple('ColumnInfo', ['schema', 'stats', 'offset_index',
-        'column_index', 'page_headers'])
-
-    file_meta_data = get_parquet_metadata(parquet_file)
-    assert len(file_meta_data.row_groups) == 1
-    # We only support flat schemas, the additional element is the root element.
-    schemas = file_meta_data.schema[1:]
-    row_group = file_meta_data.row_groups[0]
-    assert len(schemas) == len(row_group.columns)
-    row_group_index = []
-    with open(parquet_file) as file_handle:
-      for column, schema in zip(row_group.columns, schemas):
-        column_index_offset = column.column_index_offset
-        column_index_length = column.column_index_length
-        column_index = None
-        if column_index_offset and column_index_length:
-          column_index = read_serialized_object(ColumnIndex, file_handle,
-                                                column_index_offset, column_index_length)
-        column_meta_data = column.meta_data
-        stats = None
-        if column_meta_data:
-          stats = column_meta_data.statistics
-
-        offset_index_offset = column.offset_index_offset
-        offset_index_length = column.offset_index_length
-        offset_index = None
-        page_headers = []
-        if offset_index_offset and offset_index_length:
-          offset_index = read_serialized_object(OffsetIndex, file_handle,
-                                                offset_index_offset, offset_index_length)
-          for page_loc in offset_index.page_locations:
-            page_header = read_serialized_object(PageHeader, file_handle, page_loc.offset,
-                                                 page_loc.compressed_page_size)
-            page_headers.append(page_header)
-
-        column_info = ColumnInfo(schema, stats, offset_index, column_index, page_headers)
-        row_group_index.append(column_info)
-    return row_group_index
-
-  def _get_row_groups_from_hdfs_folder(self, hdfs_path, tmpdir):
-    """Returns a list of column infos (containing the schema, stats, offset_index,
-    column_index, and page_headers) for the first row group in all parquet files in
-    'hdfs_path'.
-    """
-    row_group_indexes = []
-    check_call(['hdfs', 'dfs', '-get', hdfs_path, tmpdir.strpath])
-    for root, subdirs, files in os.walk(tmpdir.strpath):
-      for f in files:
-        parquet_file = os.path.join(root, str(f))
-        row_group_indexes.append(self._get_row_group_from_file(parquet_file))
-    return row_group_indexes
-
-  def _validate_page_locations(self, page_locations):
-    """Validate that the page locations are in order."""
-    for previous_loc, current_loc in zip(page_locations[:-1], page_locations[1:]):
-      assert previous_loc.offset < current_loc.offset
-      assert previous_loc.first_row_index < current_loc.first_row_index
-
-  def _validate_null_stats(self, index_size, column_info):
-    """Validates the statistics stored in null_pages and null_counts."""
-    column_index = column_info.column_index
-    column_stats = column_info.stats
-    assert column_index.null_pages is not None
-    assert len(column_index.null_pages) == index_size
-    assert column_index.null_counts is not None
-    assert len(column_index.null_counts) == index_size
-
-    for page_is_null, null_count, page_header in zip(column_index.null_pages,
-        column_index.null_counts, column_info.page_headers):
-      assert page_header.type == PageType.DATA_PAGE
-      num_values = page_header.data_page_header.num_values
-      assert not page_is_null or null_count == num_values
-
-    if column_stats:
-      assert column_stats.null_count == sum(column_index.null_counts)
-
-  def _validate_min_max_values(self, index_size, column_info):
-    """Validate min/max values of the pages in a column chunk."""
-    column_index = column_info.column_index
-    min_values = column_info.column_index.min_values
-    assert len(min_values) == index_size
-    max_values = column_info.column_index.max_values
-    assert len(max_values) == index_size
-
-    if not column_info.stats:
-      return
-
-    column_min_value_str = column_info.stats.min_value
-    column_max_value_str = column_info.stats.max_value
-    if column_min_value_str is None or column_max_value_str is None:
-      # If either is None, then both need to be None.
-      assert column_min_value_str is None and column_max_value_str is None
-      # No min and max value, all pages need to be null
-      for idx, null_page in enumerate(column_index.null_pages):
-        assert null_page, ("Page {} of column {} is not null, but doesn't have min "
-            "and max values!".format(idx, column_info.schema.name))
-      # Everything is None, no further checks needed.
-      return
-
-    column_min_value = decode_stats_value(column_info.schema, column_min_value_str)
-    for null_page, page_min_str in zip(column_index.null_pages, min_values):
-      if not null_page:
-        page_min_value = decode_stats_value(column_info.schema, page_min_str)
-        # If type is str, page_min_value might have been truncated.
-        if isinstance(page_min_value, basestring):
-          assert page_min_value >= column_min_value[:len(page_min_value)]
-        else:
-          assert page_min_value >= column_min_value
-
-    column_max_value = decode_stats_value(column_info.schema, column_max_value_str)
-    for null_page, page_max_str in zip(column_index.null_pages, max_values):
-      if not null_page:
-        page_max_value = decode_stats_value(column_info.schema, page_max_str)
-        # If type is str, page_max_value might have been truncated and incremented.
-        if (isinstance(page_max_value, basestring) and
-            len(page_max_value) == PAGE_INDEX_MAX_STRING_LENGTH):
-          max_val_prefix = page_max_value.rstrip('\0')
-          assert max_val_prefix[:-1] <= column_max_value
-        else:
-          assert page_max_value <= column_max_value
-
-  def _validate_ordering(self, ordering, schema, null_pages, min_values, max_values):
-    """Check if the ordering of the values reflects the value of 'ordering'."""
-
-    def is_sorted(l, reverse=False):
-      if not reverse:
-        return all(a <= b for a, b in zip(l, l[1:]))
-      else:
-        return all(a >= b for a, b in zip(l, l[1:]))
-
-    # Filter out null pages and decode the actual min/max values.
-    actual_min_values = [decode_stats_value(schema, min_val)
-                         for min_val, is_null in zip(min_values, null_pages)
-                         if not is_null]
-    actual_max_values = [decode_stats_value(schema, max_val)
-                         for max_val, is_null in zip(max_values, null_pages)
-                         if not is_null]
-
-    # For ASCENDING and DESCENDING, both min and max values need to be sorted.
-    if ordering == BoundaryOrder.ASCENDING:
-      assert is_sorted(actual_min_values)
-      assert is_sorted(actual_max_values)
-    elif ordering == BoundaryOrder.DESCENDING:
-      assert is_sorted(actual_min_values, reverse=True)
-      assert is_sorted(actual_max_values, reverse=True)
-    else:
-      assert ordering == BoundaryOrder.UNORDERED
-      # For UNORDERED, the min and max values cannot both be sorted.
-      assert not is_sorted(actual_min_values) or not is_sorted(actual_max_values)
-      assert (not is_sorted(actual_min_values, reverse=True) or
-              not is_sorted(actual_max_values, reverse=True))
-
-  def _validate_boundary_order(self, column_info):
-    """Validate that min/max values are really in the order specified by
-    boundary order.
-    """
-    column_index = column_info.column_index
-    self._validate_ordering(column_index.boundary_order, column_info.schema,
-        column_index.null_pages, column_index.min_values, column_index.max_values)
-
-  def _validate_parquet_page_index(self, hdfs_path, tmpdir):
-    """Validates that 'hdfs_path' contains exactly one parquet file and that the rowgroup
-    index in that file is in the valid format.
-    """
-    row_group_indexes = self._get_row_groups_from_hdfs_folder(hdfs_path, tmpdir)
-    for columns in row_group_indexes:
-      for column_info in columns:
-        try:
-          index_size = len(column_info.offset_index.page_locations)
-          assert index_size > 0
-          self._validate_page_locations(column_info.offset_index.page_locations)
-          # IMPALA-7304: Impala doesn't write column index for floating-point columns
-          # until PARQUET-1222 is resolved.
-          if column_info.schema.type in [4, 5]:
-            assert column_info.column_index is None
-            continue
-          self._validate_null_stats(index_size, column_info)
-          self._validate_min_max_values(index_size, column_info)
-          self._validate_boundary_order(column_info)
-        except AssertionError as e:
-          e.args += ("Validation failed on column {}.".format(column_info.schema.name),)
-          raise
-
-  def _ctas_table_and_verify_index(self, vector, unique_database, source_table,
-                                   tmpdir, sorting_column=None):
-    """Copies 'source_table' into a parquet table and makes sure that the index
-    in the resulting parquet file is valid.
-    """
-    table_name = "test_hdfs_parquet_table_writer"
-    qualified_table_name = "{0}.{1}".format(unique_database, table_name)
-    hdfs_path = get_fs_path('/test-warehouse/{0}.db/{1}/'.format(unique_database,
-                                                                 table_name))
-    # Setting num_nodes = 1 ensures that the query is executed on the coordinator,
-    # resulting in a single parquet file being written.
-    vector.get_value('exec_option')['num_nodes'] = 1
-    self.execute_query("drop table if exists {0}".format(qualified_table_name))
-    if sorting_column is None:
-      query = ("create table {0} stored as parquet as select * from {1}").format(
-          qualified_table_name, source_table)
-    else:
-      query = ("create table {0} sort by({1}) stored as parquet as select * from {2}"
-               ).format(qualified_table_name, sorting_column, source_table)
-    self.execute_query(query, vector.get_value('exec_option'))
-    self._validate_parquet_page_index(hdfs_path, tmpdir.join(source_table))
-
-  def _create_string_table_with_values(self, vector, unique_database, table_name,
-                                       values_sql):
-    """Creates a parquet table that has a single string column, then invokes an insert
-    statement on it with the 'values_sql' parameter. E.g. 'values_sql' is "('asdf')".
-    It returns the HDFS path for the table.
-    """
-    qualified_table_name = "{0}.{1}".format(unique_database, table_name)
-    self.execute_query("drop table if exists {0}".format(qualified_table_name))
-    vector.get_value('exec_option')['num_nodes'] = 1
-    query = ("create table {0} (str string) stored as parquet").format(qualified_table_name)
-    self.execute_query(query, vector.get_value('exec_option'))
-    self.execute_query("insert into {0} values {1}".format(qualified_table_name,
-        values_sql), vector.get_value('exec_option'))
-    return get_fs_path('/test-warehouse/{0}.db/{1}/'.format(unique_database,
-        table_name))
-
-  def test_write_index_alltypes(self, vector, unique_database, tmpdir):
-    """Test that writing a parquet file populates the rowgroup indexes with the correct
-    values.
-    """
-    self._ctas_table_and_verify_index(vector, unique_database, "functional.alltypes",
-        tmpdir)
-
-  def test_write_index_decimals(self, vector, unique_database, tmpdir):
-    """Test that writing a parquet file populates the rowgroup indexes with the correct
-    values, using decimal types.
-    """
-    self._ctas_table_and_verify_index(vector, unique_database, "functional.decimal_tbl",
-        tmpdir)
-
-  def test_write_index_chars(self, vector, unique_database, tmpdir):
-    """Test that writing a parquet file populates the rowgroup indexes with the correct
-    values, using char types.
-    """
-    self._ctas_table_and_verify_index(vector, unique_database, "functional.chars_formats",
-        tmpdir)
-
-  def test_write_index_null(self, vector, unique_database, tmpdir):
-    """Test that we don't write min/max values in the index for null columns.
-    Ensure null_count is set for columns with null values.
-    """
-    self._ctas_table_and_verify_index(vector, unique_database, "functional.nulltable",
-        tmpdir)
-
-  def test_write_index_multi_page(self, vector, unique_database, tmpdir):
-    """Test that when a ColumnChunk is written across multiple pages, the index is
-    valid.
-    """
-    self._ctas_table_and_verify_index(vector, unique_database, "tpch.customer",
-        tmpdir)
-    self._ctas_table_and_verify_index(vector, unique_database, "tpch.orders",
-        tmpdir)
-
-  def test_write_index_sorting_column(self, vector, unique_database, tmpdir):
-    """Test that when the schema has a sorting column, the index is valid."""
-    self._ctas_table_and_verify_index(vector, unique_database,
-        "functional_parquet.zipcode_incomes", tmpdir, "id")
-
-  def test_write_index_wide_table(self, vector, unique_database, tmpdir):
-    """Test table with wide row."""
-    self._ctas_table_and_verify_index(vector, unique_database,
-        "functional_parquet.widerow", tmpdir)
-
-  def test_write_index_many_columns_tables(self, vector, unique_database, tmpdir):
-    """Test tables with wide rows and many columns."""
-    self._ctas_table_and_verify_index(vector, unique_database,
-        "functional_parquet.widetable_250_cols", tmpdir)
-    self._ctas_table_and_verify_index(vector, unique_database,
-        "functional_parquet.widetable_500_cols", tmpdir)
-    self._ctas_table_and_verify_index(vector, unique_database,
-        "functional_parquet.widetable_1000_cols", tmpdir)
-
-  def test_max_string_values(self, vector, unique_database, tmpdir):
-    """Test string values that are all 0xFFs or end with 0xFFs."""
-
-    # String value is all of 0xFFs but its length is less than PAGE_INDEX_TRUNCATE_LENGTH.
-    short_tbl = "short_tbl"
-    short_hdfs_path = self._create_string_table_with_values(vector, unique_database,
-        short_tbl, "(rpad('', {0}, chr(255)))".format(PAGE_INDEX_MAX_STRING_LENGTH - 1))
-    self._validate_parquet_page_index(short_hdfs_path, tmpdir.join(short_tbl))
-
-    # String value is all of 0xFFs and its length is PAGE_INDEX_TRUNCATE_LENGTH.
-    fit_tbl = "fit_tbl"
-    fit_hdfs_path = self._create_string_table_with_values(vector, unique_database,
-        fit_tbl, "(rpad('', {0}, chr(255)))".format(PAGE_INDEX_MAX_STRING_LENGTH))
-    self._validate_parquet_page_index(fit_hdfs_path, tmpdir.join(fit_tbl))
-
-    # All bytes are 0xFFs and the string is longer than PAGE_INDEX_TRUNCATE_LENGTH, so we
-    # should not write page statistics.
-    too_long_tbl = "too_long_tbl"
-    too_long_hdfs_path = self._create_string_table_with_values(vector, unique_database,
-        too_long_tbl, "(rpad('', {0}, chr(255)))".format(PAGE_INDEX_MAX_STRING_LENGTH + 1))
-    row_group_indexes = self._get_row_groups_from_hdfs_folder(too_long_hdfs_path,
-        tmpdir.join(too_long_tbl))
-    column = row_group_indexes[0][0]
-    assert column.column_index is None
-    # We always write the offset index
-    assert column.offset_index is not None
-
-    # Test a string value that starts with 'aaa' followed by 0xFFs and whose length is
-    # greater than PAGE_INDEX_TRUNCATE_LENGTH. The max value should be 'aab'.
-    aaa_tbl = "aaa_tbl"
-    aaa_hdfs_path = self._create_string_table_with_values(vector, unique_database,
-        aaa_tbl, "(rpad('aaa', {0}, chr(255)))".format(PAGE_INDEX_MAX_STRING_LENGTH + 1))
-    row_group_indexes = self._get_row_groups_from_hdfs_folder(aaa_hdfs_path,
-        tmpdir.join(aaa_tbl))
-    column = row_group_indexes[0][0]
-    assert len(column.column_index.max_values) == 1
-    max_value = column.column_index.max_values[0]
-    assert max_value == 'aab'
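
The too_long_tbl and aaa_tbl cases above exercise the page index's string truncation
rule: a max value longer than PAGE_INDEX_MAX_STRING_LENGTH is cut short and its last
byte is bumped so the shortened string is still an upper bound, which is impossible
when every byte is already 0xFF. The helper below is only an illustrative sketch of
that rule (the function name is made up; the real logic lives in the backend writer),
using a small limit so the 'aaa' -> 'aab' case is easy to see.

PAGE_INDEX_MAX_STRING_LENGTH = 64

def truncate_max_value(value, max_len=PAGE_INDEX_MAX_STRING_LENGTH):
  """Illustrative sketch only: shorten 'value' to at most 'max_len' bytes and bump
  the last non-0xFF byte so the result is still an upper bound for 'value'.
  Returns None when no such bound exists, i.e. every kept byte is already 0xFF."""
  if len(value) <= max_len:
    return value
  truncated = value[:max_len]
  for i in reversed(range(len(truncated))):
    if ord(truncated[i]) < 0xFF:
      return truncated[:i] + chr(ord(truncated[i]) + 1)
  return None

# With a limit of 3 bytes: 'aaa' followed by 0xFFs truncates and increments to 'aab',
# while a value that is all 0xFFs has no valid truncated upper bound.
assert truncate_max_value('aaa' + chr(255) * 10, max_len=3) == 'aab'
assert truncate_max_value(chr(255) * 10, max_len=3) is None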


[3/5] impala git commit: [DOCS] A typo fixed in ALTER VIEW syntax

Posted by jo...@apache.org.
[DOCS] A typo fixed in ALTER VIEW syntax

Change-Id: I859c279612d258ad8030fc58c83d36dbd285e210
Reviewed-on: http://gerrit.cloudera.org:8080/11711
Reviewed-by: Alex Rodoni <ar...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/275124e8
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/275124e8
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/275124e8

Branch: refs/heads/master
Commit: 275124e874f946cb12c4b97c655409107efbcc3c
Parents: 6c0512e
Author: Alex Rodoni <ar...@cloudera.com>
Authored: Wed Oct 17 12:36:47 2018 -0700
Committer: Alex Rodoni <ar...@cloudera.com>
Committed: Wed Oct 17 19:49:58 2018 +0000

----------------------------------------------------------------------
 docs/topics/impala_alter_view.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/275124e8/docs/topics/impala_alter_view.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_alter_view.xml b/docs/topics/impala_alter_view.xml
index f63152f..a503f85 100644
--- a/docs/topics/impala_alter_view.xml
+++ b/docs/topics/impala_alter_view.xml
@@ -68,7 +68,7 @@ ALTER VIEW [<varname>database_name</varname>.]<varname>view_name</varname>
    RENAME TO [<varname>database_name</varname>.]<varname>view_name</varname>;
 
 ALTER VIEW [<varname>database_name</varname>.]<varname>view_name</varname> SET OWNER USER user_name;
-ALTER TABLE [<varname>database_name</varname>.]<varname>view_name</varname> SET OWNER ROLE role_name;
+ALTER VIEW [<varname>database_name</varname>.]<varname>view_name</varname> SET OWNER ROLE role_name;
 </codeblock>
 
     <ul>
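
For quick reference, the corrected syntax above covers both ownership forms of the
statement. A sketch of issuing them through impala-shell (database, view, user, and
role names below are placeholders):

from subprocess import check_call

# Placeholder object names; the point is simply that both statements use ALTER VIEW.
check_call(["impala-shell", "-q",
            "alter view my_db.my_view set owner user alice"])
check_call(["impala-shell", "-q",
            "alter view my_db.my_view set owner role analysts"])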


[2/5] impala git commit: [DOCS] Removed an invalid xref in impala_alter_table

Posted by jo...@apache.org.
[DOCS] Removed an invalid xref in impala_alter_table

Change-Id: Ia170d70c522a38cc90518041e3b2a696de5b4e4f
Reviewed-on: http://gerrit.cloudera.org:8080/11707
Reviewed-by: Alex Rodoni <ar...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/6c0512e1
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/6c0512e1
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/6c0512e1

Branch: refs/heads/master
Commit: 6c0512e1a329ada236f2b1e047179beca9dc2e5a
Parents: a0f351a
Author: Alex Rodoni <ar...@cloudera.com>
Authored: Wed Oct 17 11:47:14 2018 -0700
Committer: Alex Rodoni <ar...@cloudera.com>
Committed: Wed Oct 17 19:20:13 2018 +0000

----------------------------------------------------------------------
 docs/topics/impala_alter_table.xml | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/6c0512e1/docs/topics/impala_alter_table.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_alter_table.xml b/docs/topics/impala_alter_table.xml
index 6667f84..412aff0 100644
--- a/docs/topics/impala_alter_table.xml
+++ b/docs/topics/impala_alter_table.xml
@@ -531,13 +531,10 @@ ALTER TABLE <varname>table_name</varname> SET SERDEPROPERTIES ('<varname>key1</v
       single partition, or column statistics at the level of the entire table.
     </p>
 
-    <p>
-      You can set the <codeph>numrows</codeph> value for table statistics by changing the
-      <codeph>TBLPROPERTIES</codeph> setting for a table or partition. For example:
-<codeblock conref="../shared/impala_common.xml#common/set_numrows_example"/>
-<codeblock conref="../shared/impala_common.xml#common/set_numrows_partitioned_example"/>
-      See <xref href="impala_perf_stats.xml#perf_table_stats_manual"/> for details.
-    </p>
+    <p> You can set the <codeph>numrows</codeph> value for table statistics by
+      changing the <codeph>TBLPROPERTIES</codeph> setting for a table or
+      partition. For example: <codeblock conref="../shared/impala_common.xml#common/set_numrows_example"/>
+      <codeblock conref="../shared/impala_common.xml#common/set_numrows_partitioned_example"/></p>
 
     <p rev="2.6.0 IMPALA-3369">
       In <keyword keyref="impala26_full"/> and higher, you can use the <codeph>SET COLUMN
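
As a rough sketch of the kind of statement the numrows paragraph above describes
(table name and row count are placeholders; the canonical samples are pulled in via
the conref'd impala_common.xml snippets, which are not shown in this diff):

from subprocess import check_call

# Placeholder table name and count; 'numRows' is the table property being set.
check_call(["impala-shell", "-q",
            "alter table my_db.analysis_data set tblproperties('numRows'='1001000000')"])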


[5/5] impala git commit: IMPALA-7639: Move concurrent UDF tests to a custom cluster test

Posted by jo...@apache.org.
IMPALA-7639: Move concurrent UDF tests to a custom cluster test

Two test_udfs.py tests (test_native_functions_race and
test_concurrent_jar_drop_use) spawn dozens of connections to
test Impala behavior under concurrency. These connections
use up frontend service threads and can cause shell tests
to time out when trying to connect.

This moves both tests to a new TestUdfConcurrency custom
cluster test. The new custom cluster test uses a larger
fe_service_threads value to allow full concurrency. The
tests run serially and cannot impact other tests.

This also reduces the test dimensions for test_native_functions_race
so that it runs one configuration rather than eight.
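
In outline, the new suite looks like the sketch below (class and test names here are
hypothetical; the real tests follow in the diff): a serially executed custom cluster
test whose impalad is started with a larger --fe_service_threads, so every concurrent
client can obtain a frontend service thread.

import pytest
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite

class TestManyClients(CustomClusterTestSuite):

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(impalad_args="--fe_service_threads=1000")
  def test_many_concurrent_clients(self, vector):
    # Each open client holds a frontend service thread, so the cluster is started
    # with enough threads for all of them plus other test traffic.
    clients = [self.create_impala_client() for _ in range(100)]
    for client in clients:
      client.execute("select 1")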

Change-Id: I3f255823167a4dd807a07276f630ef02435900a3
Reviewed-on: http://gerrit.cloudera.org:8080/11701
Reviewed-by: Joe McDonnell <jo...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/6399a65a
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/6399a65a
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/6399a65a

Branch: refs/heads/master
Commit: 6399a65a00cfb6b48da29acbb0921a360bf3a019
Parents: de7f09d
Author: Joe McDonnell <jo...@cloudera.com>
Authored: Tue Oct 16 13:07:35 2018 -0700
Committer: Joe McDonnell <jo...@cloudera.com>
Committed: Wed Oct 17 21:31:24 2018 +0000

----------------------------------------------------------------------
 tests/custom_cluster/test_udf_concurrency.py | 206 ++++++++++++++++++++++
 tests/query_test/test_udfs.py                | 162 -----------------
 2 files changed, 206 insertions(+), 162 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/6399a65a/tests/custom_cluster/test_udf_concurrency.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_udf_concurrency.py b/tests/custom_cluster/test_udf_concurrency.py
new file mode 100644
index 0000000..f0fac27
--- /dev/null
+++ b/tests/custom_cluster/test_udf_concurrency.py
@@ -0,0 +1,206 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import pytest
+import random
+import threading
+import time
+from subprocess import check_call
+
+from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
+from tests.common.impala_cluster import ImpalaCluster
+from tests.util.filesystem_utils import get_fs_path
+
+# This custom cluster test splits out concurrency tests to allow running with
+# a higher fe_service_threads (and thus higher concurrency). This also avoids
+# side-effects for other tests (see IMPALA-7639).
+
+
+class TestUdfConcurrency(CustomClusterTestSuite):
+
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestUdfConcurrency, cls).add_test_dimensions()
+
+  @pytest.mark.execute_serially
+  @CustomClusterTestSuite.with_args(impalad_args="--fe_service_threads=1000")
+  def test_native_functions_race(self, vector, unique_database):
+    """ IMPALA-6488: stress concurrent adds, uses, and deletes of native functions.
+        Exposes a crash caused by use-after-free in lib-cache."""
+
+    # Native function used by a query. Stresses lib-cache during analysis and
+    # backend expressions.
+    create_fn_to_use = \
+      """create function {0}.use_it(string) returns string
+         LOCATION '{1}'
+         SYMBOL='_Z8IdentityPN10impala_udf15FunctionContextERKNS_9StringValE'"""
+    use_fn = """select * from (select max(int_col) from functional.alltypesagg
+                where {0}.use_it(string_col) = 'blah' union all
+                (select max(int_col) from functional.alltypesagg
+                 where {0}.use_it(String_col) > '1' union all
+                (select max(int_col) from functional.alltypesagg
+                 where {0}.use_it(string_col) > '1'))) v"""
+    # Reference to another native function from the same 'so' file. Creating/dropping
+    # stresses lib-cache lookup, add, and refresh.
+    create_another_fn = """create function if not exists {0}.other(float)
+                           returns float location '{1}' symbol='Identity'"""
+    drop_another_fn = """drop function if exists {0}.other(float)"""
+    udf_path = get_fs_path('/test-warehouse/libTestUdfs.so')
+
+    # Tracks number of impalads prior to tests to check that none have crashed.
+    # All impalads are assumed to be coordinators.
+    cluster = ImpalaCluster()
+    exp_num_coordinators = cluster.num_responsive_coordinators()
+
+    setup_client = self.create_impala_client()
+    setup_query = create_fn_to_use.format(unique_database, udf_path)
+    try:
+      setup_client.execute(setup_query)
+    except Exception as e:
+      print "Unable to create initial function: {0}".format(setup_query)
+      raise
+
+    errors = []
+
+    def use_fn_method():
+      time.sleep(1 + random.random())
+      client = self.create_impala_client()
+      query = use_fn.format(unique_database)
+      try:
+        client.execute(query)
+      except Exception as e:
+        errors.append(e)
+
+    def load_fn_method():
+      time.sleep(1 + random.random())
+      client = self.create_impala_client()
+      drop = drop_another_fn.format(unique_database)
+      create = create_another_fn.format(unique_database, udf_path)
+      try:
+        client.execute(drop)
+        client.execute(create)
+      except Exception as e:
+        errors.append(e)
+
+    # number of uses/loads needed to reliably reproduce the bug.
+    num_uses = 200
+    num_loads = 200
+
+    # create threads to use native function.
+    runner_threads = []
+    for i in xrange(num_uses):
+      runner_threads.append(threading.Thread(target=use_fn_method))
+
+    # create threads to drop/create native functions.
+    for i in xrange(num_loads):
+      runner_threads.append(threading.Thread(target=load_fn_method))
+
+    # launch all runner threads.
+    for t in runner_threads: t.start()
+
+    # join all threads.
+    for t in runner_threads: t.join()
+
+    for e in errors: print e
+
+    # Checks that no impalad has crashed.
+    assert cluster.num_responsive_coordinators() == exp_num_coordinators
+
+  @pytest.mark.execute_serially
+  @CustomClusterTestSuite.with_args(impalad_args="--fe_service_threads=1000")
+  def test_concurrent_jar_drop_use(self, vector, unique_database):
+    """IMPALA-6215: race between dropping/using java udf's defined in the same jar.
+       This test runs concurrent drop/use threads that result in class not found
+       exceptions when the race is present.
+    """
+    udf_src_path = os.path.join(
+      os.environ['IMPALA_HOME'], "testdata/udfs/impala-hive-udfs.jar")
+    udf_tgt_path = get_fs_path(
+      '/test-warehouse/{0}.db/impala-hive-udfs.jar'.format(unique_database))
+
+    create_fn_to_drop = """create function {0}.foo_{1}() returns string
+                           LOCATION '{2}' SYMBOL='org.apache.impala.TestUpdateUdf'"""
+    create_fn_to_use = """create function {0}.use_it(string) returns string
+                          LOCATION '{1}' SYMBOL='org.apache.impala.TestUdf'"""
+    drop_fn = "drop function if exists {0}.foo_{1}()"
+    use_fn = """select * from (select max(int_col) from functional.alltypesagg
+                where {0}.use_it(string_col) = 'blah' union all
+                (select max(int_col) from functional.alltypesagg
+                 where {0}.use_it(String_col) > '1' union all
+                (select max(int_col) from functional.alltypesagg
+                 where {0}.use_it(string_col) > '1'))) v"""
+    num_drops = 100
+    num_uses = 100
+
+    # use a unique jar for this test to avoid interactions with other tests
+    # that use the same jar
+    check_call(["hadoop", "fs", "-put", "-f", udf_src_path, udf_tgt_path])
+
+    # create all the functions.
+    setup_client = self.create_impala_client()
+    try:
+      s = create_fn_to_use.format(unique_database, udf_tgt_path)
+      setup_client.execute(s)
+    except Exception as e:
+      print e
+      assert False
+    for i in range(0, num_drops):
+      try:
+        setup_client.execute(create_fn_to_drop.format(unique_database, i, udf_tgt_path))
+      except Exception as e:
+        print e
+        assert False
+
+    errors = []
+
+    def use_fn_method():
+      time.sleep(5 + random.random())
+      client = self.create_impala_client()
+      try:
+        client.execute(use_fn.format(unique_database))
+      except Exception as e: errors.append(e)
+
+    def drop_fn_method(i):
+      time.sleep(1 + random.random())
+      client = self.create_impala_client()
+      try:
+        client.execute(drop_fn.format(unique_database, i))
+      except Exception as e: errors.append(e)
+
+    # create threads to use functions.
+    runner_threads = []
+    for i in range(0, num_uses):
+      runner_threads.append(threading.Thread(target=use_fn_method))
+
+    # create threads to drop functions.
+    for i in range(0, num_drops):
+      runner_threads.append(threading.Thread(target=drop_fn_method, args=(i, )))
+
+    # launch all runner threads.
+    for t in runner_threads: t.start()
+
+    # join all threads.
+    for t in runner_threads: t.join()
+
+    # Check for any errors.
+    for e in errors: print e
+    assert len(errors) == 0

http://git-wip-us.apache.org/repos/asf/impala/blob/6399a65a/tests/query_test/test_udfs.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_udfs.py b/tests/query_test/test_udfs.py
index e93f511..6bb9b94 100644
--- a/tests/query_test/test_udfs.py
+++ b/tests/query_test/test_udfs.py
@@ -18,9 +18,6 @@
 from copy import copy
 import os
 import pytest
-import random
-import threading
-import time
 import tempfile
 from subprocess import call, check_call
 
@@ -298,86 +295,6 @@ class TestUdfExecution(TestUdfBase):
       self.run_test_case('QueryTest/udf-non-deterministic', vector,
           use_db=unique_database)
 
-  def test_native_functions_race(self, vector, unique_database):
-    """ IMPALA-6488: stress concurrent adds, uses, and deletes of native functions.
-        Exposes a crash caused by use-after-free in lib-cache."""
-
-    # Native function used by a query. Stresses lib-cache during analysis and
-    # backend expressions.
-    create_fn_to_use = """create function {0}.use_it(string) returns string
-                          LOCATION '{1}'
-                          SYMBOL='_Z8IdentityPN10impala_udf15FunctionContextERKNS_9StringValE'"""
-    use_fn = """select * from (select max(int_col) from functional.alltypesagg
-                where {0}.use_it(string_col) = 'blah' union all
-                (select max(int_col) from functional.alltypesagg
-                 where {0}.use_it(String_col) > '1' union all
-                (select max(int_col) from functional.alltypesagg
-                 where {0}.use_it(string_col) > '1'))) v"""
-    # Reference to another native function from the same 'so' file. Creating/dropping
-    # stresses lib-cache lookup, add, and refresh.
-    create_another_fn = """create function if not exists {0}.other(float)
-                           returns float location '{1}' symbol='Identity'"""
-    drop_another_fn = """drop function if exists {0}.other(float)"""
-    udf_path = get_fs_path('/test-warehouse/libTestUdfs.so')
-
-    # Tracks number of impalads prior to tests to check that none have crashed.
-    # All impalads are assumed to be coordinators.
-    cluster = ImpalaCluster()
-    exp_num_coordinators = cluster.num_responsive_coordinators()
-
-    setup_client = self.create_impala_client()
-    setup_query = create_fn_to_use.format(unique_database, udf_path)
-    try:
-      setup_client.execute(setup_query)
-    except Exception as e:
-      print "Unable to create initial function: {0}".format(setup_query)
-      raise
-
-    errors = []
-    def use_fn_method():
-      time.sleep(1 + random.random())
-      client = self.create_impala_client()
-      query = use_fn.format(unique_database)
-      try:
-        client.execute(query)
-      except Exception as e:
-        errors.append(e)
-
-    def load_fn_method():
-      time.sleep(1 + random.random())
-      client = self.create_impala_client()
-      drop = drop_another_fn.format(unique_database)
-      create = create_another_fn.format(unique_database, udf_path)
-      try:
-        client.execute(drop)
-        client.execute(create)
-      except Exception as e:
-        errors.append(e)
-
-    # number of uses/loads needed to reliably reproduce the bug.
-    num_uses = 200
-    num_loads = 200
-
-    # create threads to use native function.
-    runner_threads = []
-    for i in xrange(num_uses):
-      runner_threads.append(threading.Thread(target=use_fn_method))
-
-    # create threads to drop/create native functions.
-    for i in xrange(num_loads):
-      runner_threads.append(threading.Thread(target=load_fn_method))
-
-    # launch all runner threads.
-    for t in runner_threads: t.start()
-
-    # join all threads.
-    for t in runner_threads: t.join();
-
-    for e in errors: print e
-
-    # Checks that no impalad has crashed.
-    assert cluster.num_responsive_coordinators() == exp_num_coordinators
-
   def test_ir_functions(self, vector, unique_database):
     if vector.get_value('exec_option')['disable_codegen']:
       # IR functions require codegen to be enabled.
@@ -516,85 +433,6 @@ class TestUdfTargeted(TestUdfBase):
       assert "Unable to find class" in str(ex)
     self.client.execute(drop_fn_stmt)
 
-  def test_concurrent_jar_drop_use(self, vector, unique_database):
-    """IMPALA-6215: race between dropping/using java udf's defined in the same jar.
-       This test runs concurrent drop/use threads that result in class not found
-       exceptions when the race is present.
-    """
-    udf_src_path = os.path.join(
-      os.environ['IMPALA_HOME'], "testdata/udfs/impala-hive-udfs.jar")
-    udf_tgt_path = get_fs_path(
-      '/test-warehouse/{0}.db/impala-hive-udfs.jar'.format(unique_database))
-
-    create_fn_to_drop = """create function {0}.foo_{1}() returns string
-                           LOCATION '{2}' SYMBOL='org.apache.impala.TestUpdateUdf'"""
-    create_fn_to_use = """create function {0}.use_it(string) returns string
-                          LOCATION '{1}' SYMBOL='org.apache.impala.TestUdf'"""
-    drop_fn = "drop function if exists {0}.foo_{1}()"
-    use_fn = """select * from (select max(int_col) from functional.alltypesagg
-                where {0}.use_it(string_col) = 'blah' union all
-                (select max(int_col) from functional.alltypesagg
-                 where {0}.use_it(String_col) > '1' union all
-                (select max(int_col) from functional.alltypesagg
-                 where {0}.use_it(string_col) > '1'))) v"""
-    num_drops = 100
-    num_uses = 100
-
-    # use a unique jar for this test to avoid interactions with other tests
-    # that use the same jar
-    check_call(["hadoop", "fs", "-put", "-f", udf_src_path, udf_tgt_path])
-
-    # create all the functions.
-    setup_client = self.create_impala_client()
-    try:
-      s = create_fn_to_use.format(unique_database, udf_tgt_path)
-      setup_client.execute(s)
-    except Exception as e:
-      print e
-      assert False
-    for i in range(0, num_drops):
-      try:
-        setup_client.execute(create_fn_to_drop.format(unique_database, i, udf_tgt_path))
-      except Exception as e:
-        print e
-        assert False
-
-    errors = []
-    def use_fn_method():
-      time.sleep(5 + random.random())
-      client = self.create_impala_client()
-      try:
-        client.execute(use_fn.format(unique_database))
-      except Exception as e: errors.append(e)
-
-    def drop_fn_method(i):
-      time.sleep(1 + random.random())
-      client = self.create_impala_client()
-      try:
-        client.execute(drop_fn.format(unique_database, i))
-      except Exception as e: errors.append(e)
-
-    # create threads to use functions.
-    runner_threads = []
-    for i in range(0, num_uses):
-      runner_threads.append(threading.Thread(target=use_fn_method))
-
-    # create threads to drop functions.
-    drop_threads = []
-    for i in range(0, num_drops):
-      runner_threads.append(threading.Thread(target=drop_fn_method, args=(i, )))
-
-    # launch all runner threads.
-    for t in runner_threads: t.start()
-
-    # join all threads.
-    for t in runner_threads: t.join();
-
-    # Check for any errors.
-    for e in errors: print e
-    assert len(errors) == 0
-
-
   @SkipIfLocal.multiple_impalad
   def test_hive_udfs_missing_jar(self, vector, unique_database):
     """ IMPALA-2365: Impalad shouldn't crash if the udf jar isn't present