You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/07/26 21:40:35 UTC

[impala] 05/08: IMPALA-7991 IMPALA-8575: [DOCS] Document the query options for Parquet page indexes

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 21a600409bb3d780d171abd7b5f54b69a601553c
Author: Alex Rodoni <ar...@cloudera.com>
AuthorDate: Tue Jul 23 13:38:28 2019 -0700

    IMPALA-7991 IMPALA-8575: [DOCS] Document the query options for Parquet page indexes
    
    - The following options were documented:
    - parquet_read_page_index
    - parquet_write_page_index
    - parquet_page_row_count_limit
    
    Change-Id: I46c1941269feff18306863f784aa36f5037da1db
    Reviewed-on: http://gerrit.cloudera.org:8080/13900
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Zoltan Borok-Nagy <bo...@cloudera.com>
---
 docs/impala.ditamap                                |   3 +
 docs/topics/impala_parquet.xml                     |   3 +
 .../topics/impala_parquet_page_row_count_limit.xml |  63 +++++++++++++
 docs/topics/impala_parquet_read_page_index.xml     | 100 +++++++++++++++++++++
 docs/topics/impala_parquet_write_page_index.xml    |  86 ++++++++++++++++++
 5 files changed, 255 insertions(+)

diff --git a/docs/impala.ditamap b/docs/impala.ditamap
index 5e3bae1..b9bdb6d 100644
--- a/docs/impala.ditamap
+++ b/docs/impala.ditamap
@@ -216,7 +216,10 @@ under the License.
           <topicref href="topics/impala_parquet_dictionary_filtering.xml"/>
           <topicref rev="2.6.0 IMPALA-2835" href="topics/impala_parquet_fallback_schema_resolution.xml"/>
           <topicref href="topics/impala_parquet_file_size.xml"/>
+          <topicref href="topics/impala_parquet_page_row_count_limit.xml"/>
           <topicref href="topics/impala_parquet_read_statistics.xml"/>
+          <topicref href="topics/impala_parquet_read_page_index.xml"/>
+          <topicref href="topics/impala_parquet_write_page_index.xml"/>
           <topicref rev="2.6.0 IMPALA-3286" href="topics/impala_prefetch_mode.xml"/>
           <topicref href="topics/impala_query_timeout_s.xml"/>
           <topicref rev="2.7.0" href="topics/impala_replica_preference.xml"/>
diff --git a/docs/topics/impala_parquet.xml b/docs/topics/impala_parquet.xml
index d36b445..d4cdba7 100644
--- a/docs/topics/impala_parquet.xml
+++ b/docs/topics/impala_parquet.xml
@@ -355,6 +355,9 @@ under the License.
         tables produces Parquet data files with relatively narrow ranges of column values within
         each file.
       </p>
+      <p>To prevent Impala from writing the Parquet page index when creating
+        Parquet files, set the <codeph>PARQUET_WRITE_PAGE_INDEX</codeph> query
+        option to <codeph>FALSE</codeph>.</p>
 
     </conbody>
 
diff --git a/docs/topics/impala_parquet_page_row_count_limit.xml b/docs/topics/impala_parquet_page_row_count_limit.xml
new file mode 100644
index 0000000..d71f348
--- /dev/null
+++ b/docs/topics/impala_parquet_page_row_count_limit.xml
@@ -0,0 +1,63 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="parquet_page_row_count_limit">
+
+  <title>PARQUET_PAGE_ROW_COUNT_LIMIT Query Option</title>
+
+  <titlealts audience="PDF">
+
+    <navtitle>PARQUET_PAGE_ROW_COUNT_LIMIT</navtitle>
+
+  </titlealts>
+
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Parquet"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      Use the <codeph>PARQUET_PAGE_ROW_COUNT_LIMIT</codeph> query option to set the maximum
+      number of rows that can be written in a single Parquet data page. By default, there is no
+      row count limit.
+    </p>
+
+    <p>
+      <b>Type:</b> <codeph>INT</codeph>
+    </p>
+
+    <p>
+      <b>Allowed values:</b> Positive integers
+    </p>
+
+    <p>
+      <b>Added in:</b> <keyword keyref="impala33"/>
+    </p>
+
+  </conbody>
+
+</concept>
diff --git a/docs/topics/impala_parquet_read_page_index.xml b/docs/topics/impala_parquet_read_page_index.xml
new file mode 100644
index 0000000..149f00f
--- /dev/null
+++ b/docs/topics/impala_parquet_read_page_index.xml
@@ -0,0 +1,100 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="parquet_read_page_index">
+
+  <title>PARQUET_READ_PAGE_INDEX Query Option</title>
+
+  <titlealts audience="PDF">
+
+    <navtitle>PARQUET_READ_PAGE_INDEX</navtitle>
+
+  </titlealts>
+
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Parquet"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      Use the <codeph>PARQUET_READ_PAGE_INDEX</codeph> query option to disable or enable using
+      the Parquet page index during scans. The page index contains min/max statistics at the
+      page-level granularity. It can be used to skip pages and rows that do not match the
+      conditions in the <codeph>WHERE</codeph> clause.
+    </p>
+
+    <p>
+      This option enables the same optimization as the <codeph>PARQUET_READ_STATISTICS</codeph>
+      query option, but at the finer-grained page level.
+    </p>
+
+    <p>
+      Impala supports filtering based on Parquet statistics:
+    </p>
+
+    <ul>
+      <li>
+        Of the types: Boolean, Integer, Decimal, String, Timestamp
+      </li>
+
+      <li>
+        For simple predicates of the forms: <codeph>&lt;slot> &lt;op> &lt;constant></codeph> or
+        <codeph>&lt;constant> &lt;op> &lt;slot></codeph>, where <codeph>&lt;op></codeph> is one
+        of LT, LE, GE, GT, or EQ
+      </li>
+    </ul>
+
+    <p>
+      The supported values for the query option are:
+      <ul>
+        <li>
+          <codeph>true</codeph> (<codeph>1</codeph>): Read the page-level statistics from the
+          Parquet page index during query processing and filter out pages based on the
+          statistics.
+        </li>
+
+        <li>
+          <codeph>false</codeph> (<codeph>0</codeph>): Do not use the Parquet page index.
+        </li>
+
+        <li>
+          Any other values are treated as <codeph>false</codeph>.
+        </li>
+      </ul>
+    </p>
+
+    <p>
+      <b>Type:</b> Boolean
+    </p>
+
+    <p>
+      <b>Default:</b> <codeph>TRUE</codeph>
+    </p>
+
+  </conbody>
+
+</concept>
diff --git a/docs/topics/impala_parquet_write_page_index.xml b/docs/topics/impala_parquet_write_page_index.xml
new file mode 100644
index 0000000..49d0ee4
--- /dev/null
+++ b/docs/topics/impala_parquet_write_page_index.xml
@@ -0,0 +1,86 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="parquet_write_page_index">
+
+  <title>PARQUET_WRITE_PAGE_INDEX Query Option</title>
+
+  <titlealts audience="PDF">
+
+    <navtitle>PARQUET_WRITE_PAGE_INDEX</navtitle>
+
+  </titlealts>
+
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Parquet"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      Use the <codeph>PARQUET_WRITE_PAGE_INDEX</codeph> query option to disable or enable
+      writing the Parquet page index.
+    </p>
+
+    <p>
+      Impala writes page-level statistics into the Parquet page index for the following
+      types: Boolean, Integer, Decimal, String, and Timestamp.
+    </p>
+
+    <p>
+      The supported values for the query option are:
+      <ul>
+        <li>
+          <codeph>true</codeph> (<codeph>1</codeph>): Write the Parquet page index when creating
+          Parquet files.
+        </li>
+
+        <li>
+          <codeph>false</codeph> (<codeph>0</codeph>): Do not write the Parquet page index when
+          creating Parquet files.
+        </li>
+
+        <li>
+          Any other values are treated as <codeph>false</codeph>.
+        </li>
+      </ul>
+    </p>
+
+    <p>
+      <b>Type:</b> Boolean
+    </p>
+
+    <p>
+      <b>Default:</b> <codeph>TRUE</codeph>
+    </p>
+
+    <p>
+      <b>Added in:</b> <keyword keyref="impala33"/>
+    </p>
+
+  </conbody>
+
+</concept>