Posted to commits@impala.apache.org by jr...@apache.org on 2016/10/29 00:33:55 UTC

[7/7] incubator-impala git commit: New files needed to make PDF build happy.

New files needed to make PDF build happy.


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/1fcc8cee
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/1fcc8cee
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/1fcc8cee

Branch: refs/heads/doc_prototype
Commit: 1fcc8ceecf31c8602594b626dfb25f67324537f6
Parents: 8039fbb
Author: John Russell <jr...@cloudera.com>
Authored: Fri Oct 28 17:33:41 2016 -0700
Committer: John Russell <jr...@cloudera.com>
Committed: Fri Oct 28 17:33:41 2016 -0700

----------------------------------------------------------------------
 docs/Cloudera-Impala-Release-Notes.ditamap    |   10 +
 docs/topics/impala_admin.xml                  |   60 +
 docs/topics/impala_auditing.xml               |  260 +++
 docs/topics/impala_authentication.xml         |   39 +
 docs/topics/impala_cluster_sizing.xml         |  353 ++++
 docs/topics/impala_cm_installation.xml        |   56 +
 docs/topics/impala_concepts.xml               |  295 +++
 docs/topics/impala_config.xml                 |   57 +
 docs/topics/impala_config_options.xml         |  593 ++++++
 docs/topics/impala_config_performance.xml     |  291 +++
 docs/topics/impala_connecting.xml             |  202 +++
 docs/topics/impala_delegation.xml             |   88 +
 docs/topics/impala_development.xml            |  229 +++
 docs/topics/impala_faq.xml                    | 1880 ++++++++++++++++++++
 docs/topics/impala_intro.xml                  |   81 +
 docs/topics/impala_kerberos.xml               |  370 ++++
 docs/topics/impala_ldap.xml                   |  354 ++++
 docs/topics/impala_lineage.xml                |  113 ++
 docs/topics/impala_mixed_security.xml         |   46 +
 docs/topics/impala_noncm_installation.xml     |  175 ++
 docs/topics/impala_perf_benchmarking.xml      |   36 +
 docs/topics/impala_perf_cookbook.xml          |  269 +++
 docs/topics/impala_perf_resources.xml         |   60 +
 docs/topics/impala_perf_skew.xml              |  150 ++
 docs/topics/impala_perf_testing.xml           |  175 ++
 docs/topics/impala_planning.xml               |   30 +
 docs/topics/impala_ports.xml                  |  440 +++++
 docs/topics/impala_proxy.xml                  |  635 +++++++
 docs/topics/impala_rcfile.xml                 |  244 +++
 docs/topics/impala_release_notes.xml          |   17 +
 docs/topics/impala_schema_design.xml          |  222 +++
 docs/topics/impala_security_files.xml         |   67 +
 docs/topics/impala_security_guidelines.xml    |  108 ++
 docs/topics/impala_security_install.xml       |   24 +
 docs/topics/impala_security_metastore.xml     |   40 +
 docs/topics/impala_security_webui.xml         |   66 +
 docs/topics/impala_seqfile.xml                |  239 +++
 docs/topics/impala_shell_commands.xml         |  399 +++++
 docs/topics/impala_shell_running_commands.xml |  265 +++
 docs/topics/impala_ssl.xml                    |  256 +++
 docs/topics/impala_troubleshooting.xml        |  447 +++++
 docs/topics/impala_webui.xml                  |  650 +++++++
 docs/topics/rg_impala_vd.xml                  | 1165 ++++++++++++
 43 files changed, 11556 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/Cloudera-Impala-Release-Notes.ditamap
----------------------------------------------------------------------
diff --git a/docs/Cloudera-Impala-Release-Notes.ditamap b/docs/Cloudera-Impala-Release-Notes.ditamap
new file mode 100644
index 0000000..7545b2e
--- /dev/null
+++ b/docs/Cloudera-Impala-Release-Notes.ditamap
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE map PUBLIC "-//OASIS//DTD DITA Map//EN" "map.dtd">
+<map audience="standalone">
+  <title>Cloudera Impala Release Notes</title>
+  <topicref href="topics/impala_relnotes.xml" audience="HTML standalone"/>
+  <topicref href="topics/impala_new_features.xml"/>
+  <topicref href="topics/impala_incompatible_changes.xml"/>
+  <topicref href="topics/impala_known_issues.xml"/>
+  <topicref href="topics/impala_fixed_issues.xml"/>
+</map>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_admin.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_admin.xml b/docs/topics/impala_admin.xml
new file mode 100644
index 0000000..3da7d5f
--- /dev/null
+++ b/docs/topics/impala_admin.xml
@@ -0,0 +1,60 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="admin">
+
+  <title>Impala Administration</title>
+  <titlealts audience="PDF"><navtitle>Administration</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Administrators"/>
+      <!-- Although there is a reasonable amount of info on the page, it could be better to use wiki-style embedding instead of linking hither and thither. -->
+      <data name="Category" value="Stub Pages"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      As an administrator, you monitor Impala's use of resources and take action when necessary to keep Impala
+      running smoothly and avoid conflicts with other Hadoop components running on the same cluster. When you
+      detect that an issue has happened or could happen in the future, you reconfigure Impala, other components
+      such as HDFS, or even the hardware of the cluster itself to resolve or avoid problems.
+    </p>
+
+    <p outputclass="toc"/>
+
+    <p>
+      <b>Related tasks:</b>
+    </p>
+
+    <p>
+      As an administrator, you can expect to perform installation, upgrade, and configuration tasks for Impala on
+      all machines in a cluster. See <xref href="impala_install.xml#install"/>,
+      <xref href="impala_upgrading.xml#upgrading"/>, and <xref href="impala_config.xml#config"/> for details.
+    </p>
+
+    <p>
+      For security tasks typically performed by administrators, see <xref href="impala_security.xml#security"/>.
+    </p>
+
+    <p>
+      Administrators also decide how to allocate cluster resources so that all Hadoop components can run smoothly
+      together. For Impala, this task primarily involves:
+      <ul>
+        <li>
+          Deciding how many Impala queries can run concurrently and with how much memory, through the admission
+          control feature. See <xref href="impala_admission.xml#admission_control"/> for details.
+        </li>
+
+        <li>
+          Dividing cluster resources such as memory between Impala and other components, using YARN for overall
+          resource management, and Llama to mediate resource requests from Impala to YARN. See
+          <xref href="impala_resource_management.xml#resource_management"/> for details.
+        </li>
+      </ul>
+    </p>
+
+<!-- <p conref="../shared/impala_common.xml#common/impala_mr"/> -->
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_auditing.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_auditing.xml b/docs/topics/impala_auditing.xml
new file mode 100644
index 0000000..6332957
--- /dev/null
+++ b/docs/topics/impala_auditing.xml
@@ -0,0 +1,260 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="auditing">
+
+  <title>Auditing Impala Operations</title>
+  <titlealts audience="PDF"><navtitle>Auditing</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Auditing"/>
+      <data name="Category" value="Governance"/>
+      <data name="Category" value="Navigator"/>
+      <data name="Category" value="Security"/>
+      <data name="Category" value="Administrators"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      To monitor how Impala data is being used within your organization, ensure that your Impala authorization and
+      authentication policies are effective, and detect attempts at intrusion or unauthorized access to Impala
+      data, you can use the auditing feature in Impala 1.2.1 and higher:
+    </p>
+
+    <ul>
+      <li>
+        Enable auditing by including the option <codeph>-audit_event_log_dir=<varname>directory_path</varname></codeph>
+        in your <cmdname>impalad</cmdname> startup options for a cluster not managed by Cloudera Manager, or
+        <xref audience="integrated" href="cn_iu_audit_log.xml#xd_583c10bfdbd326ba--6eed2fb8-14349d04bee--7d6f/section_v25_lmy_bn">configuring Impala Daemon logging in Cloudera Manager</xref><xref audience="standalone" href="http://www.cloudera.com/documentation/enterprise/latest/topics/cn_iu_service_audit.html" scope="external" format="html">configuring Impala Daemon logging in Cloudera Manager</xref>.
+        The log directory must be a local directory on the
+        server, not an HDFS directory. A minimal example of these startup flags follows this list.
+      </li>
+
+      <li>
+        Decide how many queries will be represented in each log file. By default, Impala starts a new log file
+        every 5000 queries. To specify a different number, <ph
+          audience="standalone"
+          >include
+        the option <codeph>-max_audit_event_log_file_size=<varname>number_of_queries</varname></codeph> in the
+        <cmdname>impalad</cmdname> startup
+        options</ph><xref
+          href="cn_iu_audit_log.xml#xd_583c10bfdbd326ba--6eed2fb8-14349d04bee--7d6f/section_v25_lmy_bn"
+            audience="integrated"
+            >configure
+        Impala Daemon logging in Cloudera Manager</xref>.
+      </li>
+
+      <li> Configure Cloudera Navigator to collect and consolidate the audit
+        logs from all the hosts in the cluster. </li>
+
+      <li>
+        Use Cloudera Navigator or Cloudera Manager to filter, visualize, and produce reports based on the audit
+        data. (The Impala auditing feature works with Cloudera Manager 4.7 to 5.1 and Cloudera Navigator 2.1 and
+        higher.) Check the audit data to ensure that all activity is authorized and detect attempts at
+        unauthorized access.
+      </li>
+    </ul>
+
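+    <p>
+      For example, on a cluster not managed by Cloudera Manager, the relevant
+      <cmdname>impalad</cmdname> startup flags might look like the following. (This is a
+      minimal sketch: the log directory path is hypothetical, and 5000 queries per file is
+      the default.)
+    </p>
+
+<codeblock>-audit_event_log_dir=/var/log/impala/audit
+-max_audit_event_log_file_size=5000</codeblock>
+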
+    <p outputclass="toc inpage"/>
+  </conbody>
+
+  <concept id="auditing_performance">
+
+    <title>Durability and Performance Considerations for Impala Auditing</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Performance"/>
+    </metadata>
+  </prolog>
+
+    <conbody>
+
+      <p>
+        The auditing feature only imposes performance overhead while auditing is enabled.
+      </p>
+
+      <p>
+        Because any Impala host can process a query, enable auditing on all hosts where the
+        <ph audience="standalone"><cmdname>impalad</cmdname> daemon</ph>
+        <ph audience="integrated">Impala Daemon role</ph> runs. Each host stores its own log
+        files, in a directory in the local filesystem. The log data is periodically flushed to disk (through an
+        <codeph>fsync()</codeph> system call) to avoid loss of audit data in case of a crash.
+      </p>
+
+      <p> The runtime overhead of auditing applies to whichever host serves as the coordinator for the query, that is, the host you connect to when you issue the query. This might be the same host for all queries, or different applications or users might connect to and issue queries through different hosts. </p>
+
+      <p> To avoid excessive I/O overhead on busy coordinator hosts, Impala syncs the audit log data (using the <codeph>fsync()</codeph> system call) periodically rather than after every query. Currently, the <codeph>fsync()</codeph> calls are issued at a fixed interval, every 5 seconds. </p>
+
+      <p>
+        By default, Impala avoids losing any audit log data in the case of an error during a logging operation
+        (such as a disk full error) by immediately shutting down
+        <cmdname audience="standalone">impalad</cmdname><ph audience="integrated">the Impala
+        Daemon role</ph> on the host where the auditing problem occurred.
+        <ph audience="standalone">You can override this setting by specifying the option
+        <codeph>-abort_on_failed_audit_event=false</codeph> in the <cmdname>impalad</cmdname> startup options.</ph>
+      </p>
+    </conbody>
+  </concept>
+
+  <concept id="auditing_format">
+
+    <title>Format of the Audit Log Files</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Logs"/>
+    </metadata>
+  </prolog>
+
+    <conbody>
+
+      <p> The audit log files represent the query information in JSON format, one query per line. Typically, rather than looking at the log files themselves, you use the Cloudera Navigator product to consolidate the log data from all Impala hosts and filter and visualize the results in useful ways. (If you do examine the raw log data, you might run the files through a JSON pretty-printer first.) </p>
+
+      <p>
+        All the information about schema objects accessed by the query is encoded in a single nested record on the
+        same line. For example, the audit log for an <codeph>INSERT ... SELECT</codeph> statement records that a
+        select operation occurs on the source table and an insert operation occurs on the destination table. The
+        audit log for a query against a view records the base table accessed by the view, or multiple base tables
+        in the case of a view that includes a join query. Every Impala operation that corresponds to a SQL
+        statement is recorded in the audit logs, whether the operation succeeds or fails. Impala records more
+        information for a successful operation than for a failed one, because an unauthorized query is stopped
+        immediately, before all the query planning is completed.
+      </p>
+
+<!-- Opportunity to conref at the phrase level here... the content of this paragraph is the same as part
+     of a list bullet earlier on. -->
+
+      <p>
+        The information logged for each query includes:
+      </p>
+
+      <ul>
+        <li>
+          Client session state:
+          <ul>
+            <li>
+              Session ID
+            </li>
+
+            <li>
+              User name
+            </li>
+
+            <li>
+              Network address of the client connection
+            </li>
+          </ul>
+        </li>
+
+        <li>
+          SQL statement details:
+          <ul>
+            <li>
+              Query ID
+            </li>
+
+            <li>
+              Statement Type - DML, DDL, and so on
+            </li>
+
+            <li>
+              SQL statement text
+            </li>
+
+            <li>
+              Execution start time, in local time
+            </li>
+
+            <li>
+              Execution Status - Details on any errors that were encountered
+            </li>
+
+            <li>
+              Target Catalog Objects:
+              <ul>
+                <li>
+                  Object Type - Table, View, or Database
+                </li>
+
+                <li>
+                  Fully qualified object name
+                </li>
+
+                <li>
+                  Privilege - How the object is being used (<codeph>SELECT</codeph>, <codeph>INSERT</codeph>,
+                  <codeph>CREATE</codeph>, and so on)
+                </li>
+              </ul>
+            </li>
+          </ul>
+        </li>
+      </ul>
+
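+      <p>
+        The following is an illustrative sketch of a single audit record, reformatted onto
+        multiple lines for readability. (In the actual log files, each record occupies one
+        line; the field names shown here are representative of the categories above, not an
+        exact rendering of the schema.)
+      </p>
+
+<codeblock>{
+  "session_id": "...",
+  "user": "jdoe",
+  "network_address": "10.0.0.1:49153",
+  "query_id": "...",
+  "statement_type": "QUERY",
+  "sql_statement": "select count(*) from sample_table",
+  "start_time": "2016-10-28 17:33:41",
+  "status": "",
+  "catalog_objects": [
+    {"name": "default.sample_table", "object_type": "TABLE", "privilege": "SELECT"}
+  ]
+}</codeblock>
+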
+<!-- Delegating actual examples to the Cloudera Navigator doc for the moment.
+<p>
+Here is an excerpt from a sample audit log file:
+</p>
+<codeblock></codeblock>
+-->
+    </conbody>
+  </concept>
+
+  <concept id="auditing_exceptions">
+
+    <title>Which Operations Are Audited</title>
+
+    <conbody>
+
+      <p>
+        The kinds of SQL queries represented in the audit log are:
+      </p>
+
+      <ul>
+        <li>
+          Queries that are prevented due to lack of authorization.
+        </li>
+
+        <li>
+          Queries that Impala can parse and analyze to determine that they are authorized. The audit data is
+          recorded immediately after Impala finishes its analysis, before the query is actually executed.
+        </li>
+      </ul>
+
+      <p>
+        The audit log does not contain entries for queries that could not be parsed and analyzed. For example, a
+        query that fails due to a syntax error is not recorded in the audit log. The audit log also does not
+        contain queries that fail due to a reference to a table that does not exist, even if you would have been
+        authorized to access the table had it existed.
+      </p>
+
+      <p>
+        Certain statements in the <cmdname>impala-shell</cmdname> interpreter, such as <codeph>CONNECT</codeph>,
+        <codeph rev="1.4.0">SUMMARY</codeph>, <codeph>PROFILE</codeph>, <codeph>SET</codeph>, and
+        <codeph>QUIT</codeph>, do not correspond to actual SQL queries, and these statements are not reflected in
+        the audit log.
+      </p>
+    </conbody>
+  </concept>
+
+  <concept id="auditing_reviewing">
+
+    <title>Reviewing the Audit Logs</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Logs"/>
+    </metadata>
+  </prolog>
+
+    <conbody>
+
+      <p>
+        You typically do not review the audit logs in raw form. The Cloudera Manager Agent periodically transfers
+        the log information into a back-end database where it can be examined in consolidated form. See
+        <ph audience="standalone">the <xref href="http://www.cloudera.com/content/cloudera-content/cloudera-docs/Navigator/latest/Cloudera-Navigator-Installation-and-User-Guide/Cloudera-Navigator-Installation-and-User-Guide.html"
+            scope="external" format="html">Cloudera Navigator documentation</xref> for details</ph>
+            <xref href="cn_iu_audits.xml#cn_topic_7" audience="integrated" />.
+      </p>
+    </conbody>
+  </concept>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_authentication.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_authentication.xml b/docs/topics/impala_authentication.xml
new file mode 100644
index 0000000..7200e5f
--- /dev/null
+++ b/docs/topics/impala_authentication.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="authentication">
+
+  <title>Impala Authentication</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Security"/>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Authentication"/>
+      <data name="Category" value="Administrators"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      Authentication is the mechanism to ensure that only specified hosts and users can connect to Impala. It also
+      verifies that when clients connect to Impala, they are connected to a legitimate server. This feature
+      prevents spoofing such as <term>impersonation</term> (setting up a phony client system with the same account
+      and group names as a legitimate user) and <term>man-in-the-middle attacks</term> (intercepting application
+      requests before they reach Impala and eavesdropping on sensitive information in the requests or the results).
+    </p>
+
+    <p>
+      Impala supports authentication using either Kerberos or LDAP.
+    </p>
+
+    <note conref="../shared/impala_common.xml#common/authentication_vs_authorization"/>
+
+    <p outputclass="toc"/>
+
+    <p>
+      Once you are finished setting up authentication, move on to authorization, which involves specifying what
+      databases, tables, HDFS directories, and so on can be accessed by particular users when they connect through
+      Impala. See <xref href="impala_authorization.xml#authorization"/> for details.
+    </p>
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_cluster_sizing.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_cluster_sizing.xml b/docs/topics/impala_cluster_sizing.xml
new file mode 100644
index 0000000..382f68c
--- /dev/null
+++ b/docs/topics/impala_cluster_sizing.xml
@@ -0,0 +1,353 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="cluster_sizing">
+
+  <title>Cluster Sizing Guidelines for Impala</title>
+  <titlealts audience="PDF"><navtitle>Cluster Sizing</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Clusters"/>
+      <data name="Category" value="Planning"/>
+      <data name="Category" value="Sizing"/>
+      <data name="Category" value="Deploying"/>
+      <!-- Hoist by my own petard. Memory is an important theme of this topic but that's in a <section> title. -->
+      <data name="Category" value="Sectionated Pages"/>
+      <data name="Category" value="Memory"/>
+      <data name="Category" value="Scalability"/>
+      <data name="Category" value="Proof of Concept"/>
+      <data name="Category" value="Requirements"/>
+      <data name="Category" value="Guidelines"/>
+      <data name="Category" value="Best Practices"/>
+      <data name="Category" value="Administrators"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      <indexterm audience="Cloudera">cluster sizing</indexterm>
+      This document provides a very rough guideline to estimate the size of a cluster needed for a specific
+      customer application. You can use this information when planning how much and what type of hardware to
+      acquire for a new cluster, or when adding Impala workloads to an existing cluster.
+    </p>
+
+    <note>
+      Before making purchase or deployment decisions, consult your Cloudera representative to verify the
+      conclusions about hardware requirements based on your data volume and workload.
+    </note>
+
+<!--    <p outputclass="toc inpage"/> -->
+
+    <p>
+      Always use hosts with identical specifications and capacities for all the nodes in the cluster. Currently,
+      Impala divides the work evenly between cluster nodes, regardless of their exact hardware configuration.
+      Because work can be distributed in different ways for different queries, if some hosts are overloaded
+      compared to others in terms of CPU, memory, I/O, or network, you might experience inconsistent performance
+      and overall slowness.
+    </p>
+
+    <p>
+      For analytic workloads with star/snowflake schemas, and using consistent hardware for all nodes (64 GB RAM,
+      12 x 2 TB hard drives, 2 x E5-2630L CPUs with 12 cores total, and a 10 Gbps network), the following table estimates the number of
+      DataNodes needed in the cluster based on data size and the number of concurrent queries, for workloads
+      similar to TPC-DS benchmark queries:
+    </p>
+
+    <table>
+      <title>Cluster size estimation based on the number of concurrent queries and data size with a 20 second average query response time</title>
+      <tgroup cols="6">
+        <colspec colnum="1" colname="col1"/>
+        <colspec colnum="2" colname="col2"/>
+        <colspec colnum="3" colname="col3"/>
+        <colspec colnum="4" colname="col4"/>
+        <colspec colnum="5" colname="col5"/>
+        <colspec colnum="6" colname="col6"/>
+        <thead>
+          <row>
+            <entry>
+              Data Size
+            </entry>
+            <entry>
+              1 query
+            </entry>
+            <entry>
+              10 queries
+            </entry>
+            <entry>
+              100 queries
+            </entry>
+            <entry>
+              1000 queries
+            </entry>
+            <entry>
+              2000 queries
+            </entry>
+          </row>
+        </thead>
+        <tbody>
+          <row>
+            <entry>
+              <b>250 GB</b>
+            </entry>
+            <entry>
+              2
+            </entry>
+            <entry>
+              2
+            </entry>
+            <entry>
+              5
+            </entry>
+            <entry>
+              35
+            </entry>
+            <entry>
+              70
+            </entry>
+          </row>
+          <row>
+            <entry>
+              <b>500 GB</b>
+            </entry>
+            <entry>
+              2
+            </entry>
+            <entry>
+              2
+            </entry>
+            <entry>
+              10
+            </entry>
+            <entry>
+              70
+            </entry>
+            <entry>
+              135
+            </entry>
+          </row>
+          <row>
+            <entry>
+              <b>1 TB</b>
+            </entry>
+            <entry>
+              2
+            </entry>
+            <entry>
+              2
+            </entry>
+            <entry>
+              15
+            </entry>
+            <entry>
+              135
+            </entry>
+            <entry>
+              270
+            </entry>
+          </row>
+          <row>
+            <entry>
+              <b>15 TB</b>
+            </entry>
+            <entry>
+              2
+            </entry>
+            <entry>
+              20
+            </entry>
+            <entry>
+              200
+            </entry>
+            <entry>
+              N/A
+            </entry>
+            <entry>
+              N/A
+            </entry>
+          </row>
+          <row>
+            <entry>
+              <b>30 TB</b>
+            </entry>
+            <entry>
+              4
+            </entry>
+            <entry>
+              40
+            </entry>
+            <entry>
+              400
+            </entry>
+            <entry>
+              N/A
+            </entry>
+            <entry>
+              N/A
+            </entry>
+          </row>
+          <row>
+            <entry>
+              <b>60 TB</b>
+            </entry>
+            <entry>
+              8
+            </entry>
+            <entry>
+              80
+            </entry>
+            <entry>
+              800
+            </entry>
+            <entry>
+              N/A
+            </entry>
+            <entry>
+              N/A
+            </entry>
+          </row>
+        </tbody>
+      </tgroup>
+    </table>
+
+    <section id="sizing_factors">
+
+      <title>Factors Affecting Scalability</title>
+
+      <p>
+        A typical analytic workload (TPC-DS style queries) using recommended hardware is usually CPU-bound. Each
+        node can process roughly 1.6 GB/sec. Both CPU-bound and disk-bound workloads can scale almost linearly with
+        cluster size. However, for some workloads, the scalability might be bounded by the network, or even by
+        memory.
+      </p>
+
+      <p>
+        If the workload is already network bound (on a 10 Gbps network), increasing the cluster size won't reduce
+        the network load; in fact, a larger cluster could increase network traffic because some queries involve
+        <q>broadcast</q> operations to all DataNodes. Therefore, boosting the cluster size does not improve query
+        throughput in a network-constrained environment.
+      </p>
+
+      <p>
+        Let's look at a memory-bound workload. A workload is memory-bound if Impala cannot run any additional
+        concurrent queries because all memory allocated has already been consumed, but neither CPU, disk, nor
+        network is saturated yet. This can happen because currently Impala uses only a single core per node to
+        process join and aggregation queries. For a node with 128 GB of RAM, if a join node takes 50 GB, the system
+        cannot run more than 2 such queries at the same time.
+      </p>
+
+      <p>
+        Therefore, at most 2 cores are used. Throughput can still scale almost linearly even for a memory-bound
+        workload. It's just that the CPU will not be saturated. Per-node throughput will be lower than 1.6
+        GB/sec. Consider increasing the memory per node.
+      </p>
+
+      <p>
+        As long as the workload is not network- or memory-bound, we can use 1.6 GB/second per node as the
+        throughput estimate.
+      </p>
+    </section>
+
+    <section id="sizing_details">
+
+      <title>A More Precise Approach</title>
+
+      <p>
+        A more precise sizing estimate would require not only queries per minute (QPM), but also an average data
+        size scanned per query (D). With the proper partitioning strategy, D is usually a fraction of the total
+        data size. The following equation can be used as a rough guide to estimate the number of nodes (N) needed:
+      </p>
+
+<codeblock>Eq 1: N &gt; QPM * D / 100 GB
+</codeblock>
+
+      <p>
+        Here is an example. Suppose, on average, a query scans 50 GB of data and the average response time is
+        required to be 15 seconds or less when there are 100 concurrent queries. The QPM is 100 / 15 * 60 = 400. We can
+        estimate the number of nodes using the equation above.
+      </p>
+
+<codeblock>N &gt; QPM * D / 100GB
+N &gt; 400 * 50GB / 100GB
+N &gt; 200
+</codeblock>
+
+      <p>
+        Because this figure is a rough estimate, the corresponding number of nodes could be between 100 and 500.
+      </p>
+
+      <p>
+        The processing rate varies with the complexity of the query. If the query has more
+        joins, aggregation functions, or CPU-intensive functions such as string processing or complex UDFs, the
+        processing rate will be lower than 1.6 GB/second per node. On the other hand, if the query only scans and
+        filters numeric columns, the processing rate can be higher.
+      </p>
+    </section>
+
+    <section id="sizing_mem_estimate">
+
+      <title>Estimating Memory Requirements</title>
+      <!--
+  <prolog>
+    <metadata>
+      <data name="Category" value="Memory"/>
+    </metadata>
+  </prolog>
+      -->
+
+      <p>
+        Impala can handle joins between multiple large tables. Make sure that statistics are collected for all the
+        joined tables, using the <codeph><xref href="impala_compute_stats.xml#compute_stats">COMPUTE
+        STATS</xref></codeph> statement. However, joining big tables does consume more memory. Follow the steps
+        below to calculate the minimum memory requirement.
+      </p>
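+
+      <p>
+        For example, you would first collect statistics on both tables involved in the join,
+        using the table names from the hypothetical query below:
+      </p>
+
+<codeblock>COMPUTE STATS a;
+COMPUTE STATS b;</codeblock>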
+
+      <p>
+        Suppose you are running the following join:
+      </p>
+
+<codeblock>select a.*, b.col_1, b.col_2, ... b.col_n
+from a, b
+where a.key = b.key
+and b.col_1 in (1,2,4...)
+and b.col_4 in (....);
+</codeblock>
+
+      <p>
+        And suppose table <codeph>B</codeph> is smaller than table <codeph>A</codeph> (but still a large table).
+      </p>
+
+      <p>
+        The memory requirement for the query is that the size of the right-hand table (<codeph>B</codeph>), after
+        decompression, filtering (<codeph>b.col_n in ...</codeph>), and projection (keeping only certain columns),
+        must be less than the total memory of the entire cluster.
+      </p>
+
+<codeblock>Cluster Total Memory Requirement  = Size of the smaller table *
+  selectivity factor from the predicate *
+  projection factor * compression ratio
+</codeblock>
+
+      <p>
+        In this case, assume that table <codeph>B</codeph> is 100 TB in Parquet format with 200 columns. The
+        predicate on <codeph>B</codeph> (<codeph>b.col_1 in ... and b.col_4 in ...</codeph>) will select only 10% of
+        the rows from <codeph>B</codeph>, and for projection, we are projecting only 5 of the 200 columns.
+        Usually, Snappy compression gives about 3x compression, so we estimate a 3x compression factor.
+      </p>
+
+<codeblock>Cluster Total Memory Requirement  = Size of the smaller table *
+  selectivity factor from the predicate *
+  projection factor * compression ratio
+  = 100TB * 10% * 5/200 * 3
+  = 0.75TB
+  = 750GB
+</codeblock>
+
+      <p>
+        So, if you have a 10-node cluster where each node has 128 GB of RAM and you give 80% of it to Impala, then you have 1
+        TB of usable memory for Impala, which is more than 750 GB. Therefore, your cluster can handle join queries
+        of this magnitude.
+      </p>
+    </section>
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_cm_installation.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_cm_installation.xml b/docs/topics/impala_cm_installation.xml
new file mode 100644
index 0000000..ff8325d
--- /dev/null
+++ b/docs/topics/impala_cm_installation.xml
@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="cm_installation">
+
+  <title>Installing Impala with Cloudera Manager</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Installing"/>
+      <data name="Category" value="Cloudera Manager"/>
+      <data name="Category" value="Administrators"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      Before installing Impala through the Cloudera Manager interface, make sure all applicable nodes have the
+      appropriate hardware configuration and levels of operating system and CDH. See
+      <xref href="impala_prereqs.xml#prereqs"/> for details.
+    </p>
+
+    <note rev="1.2.0">
+      <p rev="1.2.0">
+        To install the latest Impala under CDH 4, upgrade Cloudera Manager to 4.8 or higher. Cloudera Manager 4.8 is
+        the first release that can manage the Impala catalog service introduced in Impala 1.2. Cloudera Manager 4.8
+        requires this service to be present, so if you upgrade to Cloudera Manager 4.8, also upgrade Impala to the
+        most recent version at the same time.
+<!-- Not so relevant now for 1.1.1, but maybe someday we'll capture all this history in a compatibility grid.
+        Upgrade to Cloudera Manager 4.6.2 or higher to enable Cloudera Manager to
+        handle access control for the Impala web UI, available by default through
+        port 25000 on each Impala host.
+        -->
+      </p>
+    </note>
+
+    <p>
+      For information on installing Impala in a Cloudera Manager-managed environment, see
+      <xref audience="integrated" href="cm_ig_install_impala.xml"/><xref audience="standalone" href="http://www.cloudera.com/documentation/enterprise/latest/topics/cm_ig_install_impala.html" scope="external" format="html"/>.
+    </p>
+
+    <p>
+      Managing your Impala installation through Cloudera Manager has a number of advantages. For example, when you
+      make configuration changes to CDH components using Cloudera Manager, it automatically applies changes to the
+      copies of configuration files, such as <codeph>hive-site.xml</codeph>, that Impala keeps under
+      <filepath>/etc/impala/conf</filepath>. It also sets up the Hive Metastore service that is required for
+      Impala running under CDH 4.1.
+    </p>
+
+    <p>
+      In some cases, depending on the level of Impala, CDH, and Cloudera Manager, you might need to add particular
+      component configuration details in some of the free-form option fields on the Impala configuration pages
+      within Cloudera Manager. <ph conref="../shared/impala_common.xml#common/safety_valve"/>
+    </p>
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_concepts.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_concepts.xml b/docs/topics/impala_concepts.xml
new file mode 100644
index 0000000..48b3637
--- /dev/null
+++ b/docs/topics/impala_concepts.xml
@@ -0,0 +1,295 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="concepts">
+
+  <title>Impala Concepts and Architecture</title>
+  <titlealts audience="PDF"><navtitle>Concepts and Architecture</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Concepts"/>
+      <data name="Category" value="Data Analysts"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Stub Pages"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+    <draft-comment author="-dita-use-conref-target"
+      conref="../shared/cdh_cm_common.xml#id_dgz_rhr_kv/draft-comment-test"/>
+    <p>
+      The following sections provide background information to help you become productive using Impala and
+      its features. Where appropriate, the explanations include context to help understand how aspects of Impala
+      relate to other technologies you might already be familiar with, such as relational database management
+      systems and data warehouses, or other Hadoop components such as Hive, HDFS, and HBase.
+    </p>
+
+    <p outputclass="toc"/>
+  </conbody>
+
+<!-- These other topics are waiting to be filled in. Could become subtopics or top-level topics depending on the depth of coverage in each case. -->
+
+  <concept id="intro_data_lifecycle" audience="Cloudera">
+
+    <title>Overview of the Data Lifecycle for Impala</title>
+
+    <conbody/>
+  </concept>
+
+  <concept id="intro_etl" audience="Cloudera">
+
+    <title>Overview of the Extract, Transform, Load (ETL) Process for Impala</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="ETL"/>
+      <data name="Category" value="Ingest"/>
+      <data name="Category" value="Concepts"/>
+    </metadata>
+  </prolog>
+
+    <conbody/>
+  </concept>
+
+  <concept id="intro_hadoop_data" audience="Cloudera">
+
+    <title>How Impala Works with Hadoop Data Files</title>
+
+    <conbody/>
+  </concept>
+
+  <concept id="intro_web_ui" audience="Cloudera">
+
+    <title>Overview of the Impala Web Interface</title>
+
+    <conbody/>
+  </concept>
+
+  <concept id="intro_bi" audience="Cloudera">
+
+    <title>Using Impala with Business Intelligence Tools</title>
+
+    <conbody/>
+  </concept>
+
+  <concept id="intro_ha" audience="Cloudera">
+
+    <title>Overview of Impala Availability and Fault Tolerance</title>
+
+    <conbody/>
+  </concept>
+
+<!-- This is pretty much ready to go. Decide if it should go under "Concepts" or "Performance",
+     and if it should be split out into a separate file, and then take out the audience= attribute
+     to make it visible.
+-->
+
+  <concept id="intro_llvm" audience="Cloudera">
+
+    <title>Overview of Impala Runtime Code Generation</title>
+
+    <conbody>
+
+<!-- Adapted from the CIDR15 paper written by the Impala team. -->
+
+      <p>
+        Impala uses <term>LLVM</term> (a compiler library and collection of related tools) to perform just-in-time
+        (JIT) compilation within the running <cmdname>impalad</cmdname> process. This runtime code generation
+        technique improves query execution times by generating native code optimized for the architecture of each
+        host in your particular cluster. Performance gains of 5 times or more are typical for representative
+        workloads.
+      </p>
+
+      <p>
+        Impala uses runtime code generation to produce query-specific versions of functions that are critical to
+        performance. In particular, code generation is applied to <term>inner loop</term> functions, that is, those
+        that are executed many times (for every tuple) in a given query, and thus constitute a large portion of the
+        total time the query takes to execute. For example, when Impala scans a data file, it calls a function to
+        parse each record into Impala's in-memory tuple format. For queries scanning large tables, billions of
+        records could result in billions of function calls. This function must therefore be extremely efficient for
+        good query performance, and removing even a few instructions from each function call can result in large
+        query speedups.
+      </p>
+
+      <p>
+        Overall, JIT compilation has an effect similar to writing custom code to process a query. For example, it
+        eliminates branches, unrolls loops, propagates constants, offsets and pointers, and inlines functions.
+        Inlining is especially valuable for functions used internally to evaluate expressions, where the function
+        call itself is more expensive than the function body (for example, a function that adds two numbers).
+        Inlining functions also increases instruction-level parallelism, and allows the compiler to make further
+        optimizations such as subexpression elimination across expressions.
+      </p>
+
+      <p>
+        Impala generates runtime query code automatically, so you do not need to do anything special to get this
+        performance benefit. This technique is most effective for complex and long-running queries that process
+        large numbers of rows. If you need to issue a series of short, small queries, you might turn off this
+        feature to avoid the overhead of compilation time for each query. In this case, issue the statement
+        <codeph>SET DISABLE_CODEGEN=true</codeph> to turn off runtime code generation for the duration of the
+        current session.
+      </p>
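+
+      <p>
+        For example, in <cmdname>impala-shell</cmdname> you might bracket a burst of short
+        queries like this (a minimal sketch; the table name is hypothetical):
+      </p>
+
+<codeblock>SET DISABLE_CODEGEN=true;
+SELECT count(*) FROM small_lookup_table;
+SET DISABLE_CODEGEN=false;</codeblock>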
+
+<!--
+      <p>
+        Without code generation,
+        functions tend to be suboptimal
+        to handle situations that cannot be predicted in advance.
+        For example,
+        a record-parsing function that
+        only handles integer types will be faster at parsing an integer-only file
+        than a function that handles other data types
+        such as strings and floating-point numbers.
+        However, the schemas of the files to
+        be scanned are unknown at compile time,
+        and so a general-purpose function must be used, even if at runtime
+        it is known that more limited functionality is sufficient.
+      </p>
+
+      <p>
+        A source of large runtime overheads are virtual functions. Virtual function calls incur a large performance
+        penalty, particularly when the called function is very simple, as the calls cannot be inlined.
+        If the type of the object instance is known at runtime, we can use code generation to replace the virtual
+        function call with a call directly to the correct function, which can then be inlined. This is especially
+        valuable when evaluating expression trees. In Impala (as in many systems), expressions are composed of a
+        tree of individual operators and functions.
+      </p>
+
+      <p>
+        Each type of expression that can appear in a query is implemented internally by overriding a virtual function.
+        Many of these expression functions are quite simple, for example, adding two numbers.
+        The virtual function call can be more expensive than the function body itself. By resolving the virtual
+        function calls with code generation and then inlining the resulting function calls, Impala can evaluate expressions
+        directly with no function call overhead. Inlining functions also increases
+        instruction-level parallelism, and allows the compiler to make further optimizations such as subexpression
+        elimination across expressions.
+      </p>
+-->
+    </conbody>
+  </concept>
+
+<!-- Same as the previous section: adapted from CIDR paper, ready to externalize after deciding where to go. -->
+
+  <concept audience="Cloudera" id="intro_io">
+
+    <title>Overview of Impala I/O</title>
+
+    <conbody>
+
+      <p>
+        Efficiently retrieving data from HDFS is a challenge for all SQL-on-Hadoop systems. To perform
+        data scans from both disk and memory at or near hardware speed, Impala uses an HDFS feature called
+        <term>short-circuit local reads</term> to bypass the DataNode protocol when reading from local disk. Impala
+        can read at almost disk bandwidth (approximately 100 MB/s per disk) and is typically able to saturate all
+        available disks. For example, with 12 disks, Impala is typically capable of sustaining I/O at 1.2 GB/sec.
+        Furthermore, <term>HDFS caching</term> allows Impala to access memory-resident data at memory bus speed,
+        and saves CPU cycles as there is no need to copy or checksum data blocks within memory.
+      </p>
+
+      <p>
+        The I/O manager component interfaces with storage devices to read and write data. The I/O manager assigns a
+        fixed number of worker threads per physical disk (currently one thread per rotational disk and eight per
+        SSD), providing an asynchronous interface to clients (<term>scanner threads</term>).
+      </p>
+    </conbody>
+  </concept>
+
+<!-- Same as the previous section: adapted from CIDR paper, ready to externalize after deciding where to go. -->
+
+<!-- Although good idea to get some answers from Henry first. -->
+
+  <concept audience="Cloudera" id="intro_state_distribution">
+
+    <title>State distribution</title>
+
+    <conbody>
+
+      <p>
+        As a massively parallel database that can run on hundreds of nodes, Impala must coordinate and synchronize
+        its metadata across the entire cluster. Impala's symmetric-node architecture means that any node can accept
+        and execute queries, and thus each node needs up-to-date versions of the system catalog and knowledge of
+        which hosts the <cmdname>impalad</cmdname> daemons run on. To avoid the overhead of TCP connections and
+        remote procedure calls to retrieve metadata during query planning, Impala implements a simple
+        publish-subscribe service called the <term>statestore</term> to push metadata changes to a set of
+        subscribers (the <cmdname>impalad</cmdname> daemons running on all the DataNodes).
+      </p>
+
+      <p>
+        The statestore maintains a set of topics, which are arrays of <codeph>(<varname>key</varname>,
+        <varname>value</varname>, <varname>version</varname>)</codeph> triplets called <term>entries</term> where
+        <varname>key</varname> and <varname>value</varname> are byte arrays, and <varname>version</varname> is a
+        64-bit integer. A topic is defined by an application, and so the statestore has no understanding of the
+        contents of any topic entry. Topics are persistent through the lifetime of the statestore, but are not
+        persisted across service restarts. Processes that receive updates to any topic are called
+        <term>subscribers</term>, and express their interest by registering with the statestore at startup and
+        providing a list of topics. The statestore responds to registration by sending the subscriber an initial
+        topic update for each registered topic, which consists of all the entries currently in that topic.
+      </p>
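+
+      <p>
+        Conceptually, a topic entry can be pictured as the following structure (a sketch for
+        illustration only, not the actual Impala source code):
+      </p>
+
+<codeblock>struct TopicEntry {
+  string  key;     // byte array identifying the entry within its topic
+  string  value;   // opaque byte array; the statestore does not interpret it
+  int64_t version; // 64-bit version number, used to compute per-subscriber deltas
+};</codeblock>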
+
+<!-- Henry: OK, but in practice, what is in these topic messages for Impala? -->
+
+      <p>
+        After registration, the statestore periodically sends two kinds of messages to each subscriber. The first
+        kind of message is a topic update, and consists of all changes to a topic (new entries, modified entries,
+        and deletions) since the last update was successfully sent to the subscriber. Each subscriber maintains a
+        per-topic most-recent-version identifier which allows the statestore to only send the delta between
+        updates. In response to a topic update, each subscriber sends a list of changes it intends to make to its
+        subscribed topics. Those changes are guaranteed to have been applied by the time the next update is
+        received.
+      </p>
+
+      <p>
+        The second kind of statestore message is a <term>heartbeat</term>, formerly sometimes called
+        <term>keepalive</term>. The statestore uses heartbeat messages to maintain the connection to each
+        subscriber, which would otherwise time out its subscription and attempt to re-register.
+      </p>
+
+      <p>
+        Prior to Impala 2.0, both kinds of communication were combined in a single kind of message. Because these
+        messages could be very large in instances with thousands of tables, partitions, data files, and so on,
+        Impala 2.0 and higher divides the types of messages so that the small heartbeat pings can be transmitted
+        and acknowledged quickly, increasing the reliability of the statestore mechanism that detects when Impala
+        nodes become unavailable.
+      </p>
+
+      <p>
+        If the statestore detects a failed subscriber (for example, by repeated failed heartbeat deliveries), it
+        stops sending updates to that node.
+<!-- Henry: what are examples of these transient topic entries? -->
+        Some topic entries are marked as transient, meaning that if their owning subscriber fails, they are
+        removed.
+      </p>
+
+      <p>
+        Although the asynchronous nature of this mechanism means that metadata updates might take some time to
+        propagate across the entire cluster, that does not affect the consistency of query planning or results.
+        Each query is planned and coordinated by a particular node, so as long as the coordinator node is aware of
+        the existence of the relevant tables, data files, and so on, it can distribute the query work to other
+        nodes even if those other nodes have not received the latest metadata updates.
+<!-- Henry: need another example here of what's in a topic, e.g. is it the list of available tables? -->
+<!--
+        For example, query planning is performed on a single node based on the
+        catalog metadata topic, and once a full plan has been computed, all information required to execute that
+        plan is distributed directly to the executing nodes.
+        There is no requirement that an executing node should
+        know about the same version of the catalog metadata topic.
+-->
+      </p>
+
+      <p>
+        We have found that the statestore process with default settings scales well to medium-sized clusters, and
+        can serve our largest deployments with some configuration changes.
+<!-- Henry: elaborate on the configuration changes. -->
+      </p>
+
+      <p>
+<!-- Henry: other examples like load information? How is load information used? -->
+        The statestore does not persist any metadata to disk: all current metadata is pushed to the statestore by
+        its subscribers (for example, load information). Therefore, should a statestore restart, its state can be
+        recovered during the initial subscriber registration phase. Or if the machine that the statestore is
+        running on fails, a new statestore process can be started elsewhere, and subscribers can fail over to it.
+        There is no built-in failover mechanism in Impala; instead, deployments commonly use a retargetable DNS
+        entry to force subscribers to automatically move to the new process instance.
+<!-- Henry: translate that last sentence into instructions / guidelines. -->
+      </p>
+    </conbody>
+  </concept>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_config.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_config.xml b/docs/topics/impala_config.xml
new file mode 100644
index 0000000..7ea82e5
--- /dev/null
+++ b/docs/topics/impala_config.xml
@@ -0,0 +1,57 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="config">
+
+  <title>Managing Impala</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Administrators"/>
+      <data name="Category" value="Configuring"/>
+      <data name="Category" value="JDBC"/>
+      <data name="Category" value="ODBC"/>
+      <data name="Category" value="Stub Pages"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      This section explains how to configure Impala to accept connections from applications that use popular
+      programming APIs:
+    </p>
+
+    <ul>
+      <li>
+        <xref href="impala_config_performance.xml#config_performance"/>
+      </li>
+
+      <li>
+        <xref href="impala_odbc.xml#impala_odbc"/>
+      </li>
+
+      <li>
+        <xref href="impala_jdbc.xml#impala_jdbc"/>
+      </li>
+    </ul>
+
+    <p>
+      This type of configuration is especially useful when using Impala in combination with Business Intelligence
+      tools, which use these standard interfaces to query different kinds of database and Big Data systems.
+    </p>
+
+    <p>
+      You can also configure these other aspects of Impala:
+    </p>
+
+    <ul>
+      <li>
+        <xref href="impala_security.xml#security"/>
+      </li>
+
+      <li>
+        <xref href="impala_config_options.xml#config_options"/>
+      </li>
+    </ul>
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_config_options.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_config_options.xml b/docs/topics/impala_config_options.xml
new file mode 100644
index 0000000..03f07d2
--- /dev/null
+++ b/docs/topics/impala_config_options.xml
@@ -0,0 +1,593 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="config_options">
+
+  <title>Modifying Impala Startup Options</title>
+
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Configuring"/>
+      <data name="Category" value="Administrators"/>
+      <data name="Category" value="Developers"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      <indexterm audience="Cloudera">defaults file</indexterm>
+
+      <indexterm audience="Cloudera">configuration file</indexterm>
+
+      <indexterm audience="Cloudera">options</indexterm>
+
+      <indexterm audience="Cloudera">IMPALA_STATE_STORE_PORT</indexterm>
+
+      <indexterm audience="Cloudera">IMPALA_BACKEND_PORT</indexterm>
+
+      <indexterm audience="Cloudera">IMPALA_LOG_DIR</indexterm>
+
+      <indexterm audience="Cloudera">IMPALA_STATE_STORE_ARGS</indexterm>
+
+      <indexterm audience="Cloudera">IMPALA_SERVER_ARGS</indexterm>
+
+      <indexterm audience="Cloudera">ENABLE_CORE_DUMPS</indexterm>
+
+      <indexterm audience="Cloudera">core dumps</indexterm>
+
+      <indexterm audience="Cloudera">restarting services</indexterm>
+
+      <indexterm audience="Cloudera">services</indexterm>
+      The configuration options for the Impala-related daemons let you choose which hosts and
+      ports to use for the services that run on a single host, specify directories for logging,
+      control resource usage and security, and specify other aspects of the Impala software.
+    </p>
+
+    <p outputclass="toc inpage"/>
+
+  </conbody>
+
+  <concept id="config_options_cm">
+
+    <title>Configuring Impala Startup Options through Cloudera Manager</title>
+
+    <conbody>
+
+      <p>
+        If you manage your cluster through Cloudera Manager, configure the settings for all the
+        Impala-related daemons by navigating to this page:
+        <menucascade><uicontrol>Clusters</uicontrol><uicontrol>Impala</uicontrol><uicontrol>Configuration</uicontrol><uicontrol>View
+        and Edit</uicontrol></menucascade>. See the Cloudera Manager documentation for
+        <xref href="http://www.cloudera.com/documentation/enterprise/latest/topics/cm_mc_impala_service.html" scope="external" format="html">instructions
+        about how to configure Impala through Cloudera Manager</xref>.
+      </p>
+
+      <p>
+        If the Cloudera Manager interface does not yet have a form field for a newly added
+        option, or if you need to use special options for debugging and troubleshooting, the
+        <uicontrol>Advanced</uicontrol> option page for each daemon includes one or more fields
+        where you can enter option names directly.
+        <ph conref="../shared/impala_common.xml#common/safety_valve"/> There is also a free-form
+        field for query options on the top-level <uicontrol>Impala Daemon</uicontrol> options
+        page.
+      </p>
+
+    </conbody>
+
+  </concept>
+
+  <concept id="config_options_noncm">
+
+    <title>Configuring Impala Startup Options through the Command Line</title>
+
+    <conbody>
+
+      <p>
+        When you run Impala in a non-Cloudera Manager environment, the Impala server,
+        statestore, and catalog services start up using values provided in a defaults file,
+        <filepath>/etc/default/impala</filepath>.
+      </p>
+
+      <p>
+        This file includes information about many resources used by Impala. The defaults
+        included in this file are appropriate in most cases. For example, you would typically
+        not change the definition of the <codeph>CLASSPATH</codeph> variable, but you would
+        always set the address used by the statestore server. Some of the content you
+        might modify includes:
+      </p>
+
+<!-- Note: Update the following example for each release with the associated lines from /etc/default/impala
+           from a non-CM-managed system. -->
+
+<codeblock rev="ver">IMPALA_STATE_STORE_HOST=127.0.0.1
+IMPALA_STATE_STORE_PORT=24000
+IMPALA_BACKEND_PORT=22000
+IMPALA_LOG_DIR=/var/log/impala
+IMPALA_CATALOG_SERVICE_HOST=...
+IMPALA_STATE_STORE_HOST=...
+
+export IMPALA_STATE_STORE_ARGS=${IMPALA_STATE_STORE_ARGS:- \
+    -log_dir=${IMPALA_LOG_DIR} -state_store_port=${IMPALA_STATE_STORE_PORT}}
+IMPALA_SERVER_ARGS=" \
+-log_dir=${IMPALA_LOG_DIR} \
+-catalog_service_host=${IMPALA_CATALOG_SERVICE_HOST} \
+-state_store_port=${IMPALA_STATE_STORE_PORT} \
+-use_statestore \
+-state_store_host=${IMPALA_STATE_STORE_HOST} \
+-be_port=${IMPALA_BACKEND_PORT}"
+export ENABLE_CORE_DUMPS=${ENABLE_CORE_DUMPS:-false}</codeblock>
+
+      <p>
+        To use alternate values, edit the defaults file, then restart all the Impala-related
+        services so that the changes take effect. Restart the Impala server using the following
+        command:
+      </p>
+
+<codeblock>$ sudo service impala-server restart
+Stopping Impala Server:                                    [  OK  ]
+Starting Impala Server:                                    [  OK  ]</codeblock>
+
+      <p>
+        Restart the Impala statestore using the following command:
+      </p>
+
+<codeblock>$ sudo service impala-state-store restart
+Stopping Impala State Store Server:                        [  OK  ]
+Starting Impala State Store Server:                        [  OK  ]</codeblock>
+
+      <p>
+        Restart the Impala catalog service using the following command:
+      </p>
+
+<codeblock>$ sudo service impala-catalog restart
+Stopping Impala Catalog Server:                            [  OK  ]
+Starting Impala Catalog Server:                            [  OK  ]</codeblock>
+
+      <p>
+        Some common settings to change include:
+      </p>
+
+      <ul>
+        <li>
+          <p>
+            Statestore address. Cloudera recommends the statestore be on a separate host not
+            running the <cmdname>impalad</cmdname> daemon. In that recommended configuration,
+            the <cmdname>impalad</cmdname> daemon cannot refer to the statestore server using
+            the loopback address. If the statestore is hosted on a machine with an IP address of
+            192.168.0.27, change:
+          </p>
+<codeblock>IMPALA_STATE_STORE_HOST=127.0.0.1</codeblock>
+          <p>
+            to:
+          </p>
+<codeblock>IMPALA_STATE_STORE_HOST=192.168.0.27</codeblock>
+        </li>
+
+        <li rev="1.2">
+          <p>
+            Catalog server address (including both the hostname and the port number). Update the
+            value of the <codeph>IMPALA_CATALOG_SERVICE_HOST</codeph> variable. Cloudera
+            recommends the catalog server be on the same host as the statestore. In that
+            recommended configuration, the <cmdname>impalad</cmdname> daemon cannot refer to the
+            catalog server using the loopback address. If the catalog service is hosted on a
+            machine with an IP address of 192.168.0.27, add the following line:
+          </p>
+<codeblock>IMPALA_CATALOG_SERVICE_HOST=192.168.0.27:26000</codeblock>
+          <p>
+            The <filepath>/etc/default/impala</filepath> defaults file currently does not define
+            an <codeph>IMPALA_CATALOG_ARGS</codeph> environment variable, but if you add one it
+            will be recognized by the service startup/shutdown script. Add a definition for this
+            variable to <filepath>/etc/default/impala</filepath> and add the option
+            <codeph>-catalog_service_host=<varname>hostname</varname></codeph>. If the port is
+            different from the default 26000, also add the option
+            <codeph>-catalog_service_port=<varname>port</varname></codeph>.
+          </p>
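+          <p>
+            For example, a definition following the pattern of the other
+            <codeph>*_ARGS</codeph> variables in the file might look like the following
+            sketch (the address is the example one from above, and the
+            <codeph>-catalog_service_port</codeph> option is only needed when the port
+            differs from the default):
+          </p>
+<codeblock>export IMPALA_CATALOG_ARGS=" \
+-log_dir=${IMPALA_LOG_DIR} \
+-catalog_service_host=192.168.0.27 \
+-catalog_service_port=26000"</codeblock>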
+        </li>
+
+        <li id="mem_limit">
+          <p>
+            Memory limits. You can limit the amount of memory available to Impala. For example,
+            to allow Impala to use no more than 70% of system memory, change:
+          </p>
+<!-- Note: also needs to be updated for each release to reflect latest /etc/default/impala. -->
+<codeblock>export IMPALA_SERVER_ARGS=${IMPALA_SERVER_ARGS:- \
+    -log_dir=${IMPALA_LOG_DIR} \
+    -state_store_port=${IMPALA_STATE_STORE_PORT} \
+    -use_statestore -state_store_host=${IMPALA_STATE_STORE_HOST} \
+    -be_port=${IMPALA_BACKEND_PORT}}</codeblock>
+          <p>
+            to:
+          </p>
+<codeblock>export IMPALA_SERVER_ARGS=${IMPALA_SERVER_ARGS:- \
+    -log_dir=${IMPALA_LOG_DIR} -state_store_port=${IMPALA_STATE_STORE_PORT} \
+    -use_statestore -state_store_host=${IMPALA_STATE_STORE_HOST} \
+    -be_port=${IMPALA_BACKEND_PORT} -mem_limit=70%}</codeblock>
+          <p>
+            You can specify the memory limit using absolute notation such as
+            <codeph>500m</codeph> or <codeph>2G</codeph>, or as a percentage of physical memory
+            such as <codeph>60%</codeph>.
+          </p>
+
+          <note>
+            Queries that exceed the specified memory limit are aborted. Percentage limits are
+            based on the physical memory of the machine and do not consider cgroups.
+          </note>
+        </li>
+
+        <li>
+          <p>
+            Core dump enablement. To enable core dumps on systems not managed by Cloudera
+            Manager, change:
+          </p>
+<codeblock>export ENABLE_CORE_DUMPS=${ENABLE_CORE_DUMPS:-false}</codeblock>
+          <p>
+            to:
+          </p>
+<codeblock>export ENABLE_CORE_DUMPS=${ENABLE_CORE_DUMPS:-true}</codeblock>
+          <p>
+            On systems managed by Cloudera Manager, enable the <uicontrol>Enable Core
+            Dump</uicontrol> setting for the Impala service.
+          </p>
+
+          <note conref="../shared/impala_common.xml#common/core_dump_considerations"/>
+        </li>
+
+        <li>
+          <p>
+            Authorization using the open source Sentry plugin. Specify the
+            <codeph>-server_name</codeph> and <codeph>-authorization_policy_file</codeph>
+            options as part of the <codeph>IMPALA_SERVER_ARGS</codeph> and
+            <codeph>IMPALA_STATE_STORE_ARGS</codeph> settings to enable the core Impala support
+            for authorization. See <xref href="impala_authorization.xml#secure_startup"/> for
+            details.
+          </p>
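+          <p>
+            For example, the relevant flags appended to those settings might look like the
+            following (the server name and HDFS policy file path are placeholders; substitute
+            your own values):
+          </p>
+<codeblock>-server_name=server1 \
+-authorization_policy_file=/user/hive/warehouse/auth-policy.ini</codeblock>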
+        </li>
+
+        <li>
+          <p>
+            Auditing for successful or blocked Impala queries, another aspect of security.
+            Specify the <codeph>-audit_event_log_dir=<varname>directory_path</varname></codeph>
+            option and optionally the
+            <codeph>-max_audit_event_log_file_size=<varname>number_of_queries</varname></codeph>
+            and <codeph>-abort_on_failed_audit_event</codeph> options as part of the
+            <codeph>IMPALA_SERVER_ARGS</codeph> settings, for each Impala node, to enable and
+            customize auditing. See <xref href="impala_auditing.xml#auditing"/> for details.
+          </p>
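+          <p>
+            For example, the following flags appended to <codeph>IMPALA_SERVER_ARGS</codeph>
+            would write audit logs under a hypothetical
+            <filepath>/var/log/impala/audit</filepath> directory and start a new log file
+            after every 5000 queries:
+          </p>
+<codeblock>-audit_event_log_dir=/var/log/impala/audit \
+-max_audit_event_log_file_size=5000</codeblock>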
+        </li>
+
+        <li>
+          <p>
+            Password protection for the Impala web UI, which listens on port 25000 by default.
+            This feature involves adding some or all of the
+            <codeph>--webserver_password_file</codeph>,
+            <codeph>--webserver_authentication_domain</codeph>, and
+            <codeph>--webserver_certificate_file</codeph> options to the
+            <codeph>IMPALA_SERVER_ARGS</codeph> and <codeph>IMPALA_STATE_STORE_ARGS</codeph>
+            settings. See <xref href="impala_security_guidelines.xml#security_guidelines"/> for
+            details.
+          </p>
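+          <p>
+            For example (the file paths are illustrative; point the options at your own
+            password and certificate files):
+          </p>
+<codeblock>--webserver_password_file=/etc/impala/webserver-htpasswd \
+--webserver_certificate_file=/etc/impala/impala-cert.pem</codeblock>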
+        </li>
+
+        <li id="default_query_options">
+          <p rev="DOCS-677">
+            Another setting you might add to <codeph>IMPALA_SERVER_ARGS</codeph> is a
+            comma-separated list of query options and values:
+<codeblock>-default_query_options='<varname>option</varname>=<varname>value</varname>,<varname>option</varname>=<varname>value</varname>,...'</codeblock>
+            These options control the behavior of queries performed by this
+            <cmdname>impalad</cmdname> instance. The option values you specify here override the
+            default values for <xref href="impala_query_options.xml#query_options">Impala query
+            options</xref>, as shown by the <codeph>SET</codeph> statement in
+            <cmdname>impala-shell</cmdname>.
+          </p>
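+          <p rev="DOCS-677">
+            For example, the following illustrative setting caps per-query memory at 2 GB
+            and raises the default <codeph>EXPLAIN</codeph> detail level for all queries run
+            through this <cmdname>impalad</cmdname>:
+          </p>
+<codeblock>-default_query_options='mem_limit=2g,explain_level=2'</codeblock>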
+        </li>
+
+<!-- Removing this reference now that the options are de-emphasized / desupported in CDH 5.5 / Impala 2.3 and up.
+        <li rev="1.2">
+          <p>
+          Options for resource management, in conjunction with the YARN component. These options include
+          <codeph>-enable_rm</codeph> and <codeph>-cgroup_hierarchy_path</codeph>.
+          <ph rev="1.4.0">Additional options to help fine-tune the resource estimates are
+          <codeph>-\u2014rm_always_use_defaults</codeph>,
+          <codeph>-\u2014rm_default_memory=<varname>size</varname></codeph>, and
+          <codeph>-\u2014rm_default_cpu_cores</codeph>.</ph> For details about these options, see
+          <xref href="impala_resource_management.xml#rm_options"/>. See
+          <xref href="impala_resource_management.xml#resource_management"/> for information about resource
+          management in general.
+          </p>
+        </li>
+-->
+
+        <li>
+          <p>
+            During troubleshooting, Cloudera Support might direct you to change other values,
+            particularly for <codeph>IMPALA_SERVER_ARGS</codeph>, to work around issues or
+            gather debugging information.
+          </p>
+        </li>
+      </ul>
+
+<!-- Removing this reference now that the options are de-emphasized / desupported in CDH 5.5 / Impala 2.3 and up.
+      <p conref="impala_resource_management.xml#rm_options/resource_management_impalad_options"/>
+-->
+
+      <note>
+        <p>
+          These startup options for the <cmdname>impalad</cmdname> daemon are different from the
+          command-line options for the <cmdname>impala-shell</cmdname> command. For the
+          <cmdname>impala-shell</cmdname> options, see
+          <xref href="impala_shell_options.xml#shell_options"/>.
+        </p>
+      </note>
+
+      <p audience="Cloudera" outputclass="toc inpage"/>
+
+    </conbody>
+
+    <concept audience="Cloudera" id="config_options_impalad_details">
+
+      <title>Configuration Options for impalad Daemon</title>
+
+      <conbody>
+
+        <p>
+          Some common settings to change include:
+        </p>
+
+        <ul>
+          <li>
+            <p>
+              Statestore address. Cloudera recommends the statestore be on a separate host not
+              running the <cmdname>impalad</cmdname> daemon. In that recommended configuration,
+              the <cmdname>impalad</cmdname> daemon cannot refer to the statestore server using
+              the loopback address. If the statestore is hosted on a machine with an IP address
+              of 192.168.0.27, change:
+            </p>
+<codeblock>IMPALA_STATE_STORE_HOST=127.0.0.1</codeblock>
+            <p>
+              to:
+            </p>
+<codeblock>IMPALA_STATE_STORE_HOST=192.168.0.27</codeblock>
+          </li>
+
+          <li rev="1.2">
+            <p>
+              Catalog server address. Update the <codeph>IMPALA_CATALOG_SERVICE_HOST</codeph>
+              variable, including both the hostname and the port number in the value. Cloudera
+              recommends the catalog server be on the same host as the statestore. In that
+              recommended configuration, the <cmdname>impalad</cmdname> daemon cannot refer to
+              the catalog server using the loopback address. If the catalog service is hosted on
+              a machine with an IP address of 192.168.0.27, add the following line:
+            </p>
+<codeblock>IMPALA_CATALOG_SERVICE_HOST=192.168.0.27:26000</codeblock>
+            <p>
+              The <filepath>/etc/default/impala</filepath> defaults file currently does not
+              define an <codeph>IMPALA_CATALOG_ARGS</codeph> environment variable, but if you
+              add one it will be recognized by the service startup/shutdown script. Add a
+              definition for this variable to <filepath>/etc/default/impala</filepath> and add
+              the option <codeph>-catalog_service_host=<varname>hostname</varname></codeph>. If
+              the port is different from the default 26000, also add the option
+              <codeph>-catalog_service_port=<varname>port</varname></codeph>.
+            </p>
+          </li>
+
+          <li id="mem_limit">
+            Memory limits. You can limit the amount of memory available to Impala. For example,
+            to allow Impala to use no more than 70% of system memory, change:
+<!-- Note: also needs to be updated for each release to reflect latest /etc/default/impala. -->
+<codeblock>export IMPALA_SERVER_ARGS=${IMPALA_SERVER_ARGS:- \
+    -log_dir=${IMPALA_LOG_DIR} \
+    -state_store_port=${IMPALA_STATE_STORE_PORT} \
+    -use_statestore -state_store_host=${IMPALA_STATE_STORE_HOST} \
+    -be_port=${IMPALA_BACKEND_PORT}}</codeblock>
+            <p>
+              to:
+            </p>
+<codeblock>export IMPALA_SERVER_ARGS=${IMPALA_SERVER_ARGS:- \
+    -log_dir=${IMPALA_LOG_DIR} -state_store_port=${IMPALA_STATE_STORE_PORT} \
+    -use_statestore -state_store_host=${IMPALA_STATE_STORE_HOST} \
+    -be_port=${IMPALA_BACKEND_PORT} -mem_limit=70%}</codeblock>
+            <p>
+              You can specify the memory limit using absolute notation such as
+              <codeph>500m</codeph> or <codeph>2G</codeph>, or as a percentage of physical
+              memory such as <codeph>60%</codeph>.
+            </p>
+
+            <note>
+              Queries that exceed the specified memory limit are aborted. Percentage limits are
+              based on the physical memory of the machine and do not consider cgroups.
+            </note>
+          </li>
+
+          <li>
+            Core dump enablement. To enable core dumps, change:
+<codeblock>export ENABLE_CORE_DUMPS=${ENABLE_CORE_DUMPS:-false}</codeblock>
+            <p>
+              to:
+            </p>
+<codeblock>export ENABLE_CORE_DUMPS=${ENABLE_CORE_DUMPS:-true}</codeblock>
+            <note>
+              The location of core dump files may vary according to your operating system
+              configuration. Other security settings may prevent Impala from writing core dumps
+              even when this option is enabled.
+            </note>
+          </li>
+
+          <li>
+            Authorization using the open source Sentry plugin. Specify the
+            <codeph>-server_name</codeph> and <codeph>-authorization_policy_file</codeph>
+            options as part of the <codeph>IMPALA_SERVER_ARGS</codeph> and
+            <codeph>IMPALA_STATE_STORE_ARGS</codeph> settings to enable the core Impala support
+            for authorization. See <xref href="impala_authorization.xml#secure_startup"/> for
+            details.
+          </li>
+
+          <li>
+            Auditing for successful or blocked Impala queries, another aspect of security.
+            Specify the <codeph>-audit_event_log_dir=<varname>directory_path</varname></codeph>
+            option and optionally the
+            <codeph>-max_audit_event_log_file_size=<varname>number_of_queries</varname></codeph>
+            and <codeph>-abort_on_failed_audit_event</codeph> options as part of the
+            <codeph>IMPALA_SERVER_ARGS</codeph> settings, for each Impala node, to enable and
+            customize auditing. See <xref href="impala_auditing.xml#auditing"/> for details.
+          </li>
+
+          <li>
+            Password protection for the Impala web UI, which listens on port 25000 by default.
+            This feature involves adding some or all of the
+            <codeph>--webserver_password_file</codeph>,
+            <codeph>--webserver_authentication_domain</codeph>, and
+            <codeph>--webserver_certificate_file</codeph> options to the
+            <codeph>IMPALA_SERVER_ARGS</codeph> and <codeph>IMPALA_STATE_STORE_ARGS</codeph>
+            settings. See <xref href="impala_security_webui.xml"/> for details.
+          </li>
+
+          <li id="default_query_options">
+            Another setting you might add to <codeph>IMPALA_SERVER_ARGS</codeph> is:
+<codeblock>-default_query_options='<varname>option</varname>=<varname>value</varname>,<varname>option</varname>=<varname>value</varname>,...'</codeblock>
+            These options control the behavior of queries performed by this
+            <cmdname>impalad</cmdname> instance. The option values you specify here override the
+            default values for <xref href="impala_query_options.xml#query_options">Impala query
+            options</xref>, as shown by the <codeph>SET</codeph> statement in
+            <cmdname>impala-shell</cmdname>.
+          </li>
+
+<!-- Removing this reference now that the options are de-emphasized / desupported in CDH 5.5 / Impala 2.3 and up.
+          <li rev="1.2">
+            Options for resource management, in conjunction with the YARN component. These options
+            include <codeph>-enable_rm</codeph> and <codeph>-cgroup_hierarchy_path</codeph>.
+            <ph rev="1.4.0">Additional options to help fine-tune the resource estimates are
+            <codeph>-\u2014rm_always_use_defaults</codeph>,
+            <codeph>-\u2014rm_default_memory=<varname>size</varname></codeph>, and
+            <codeph>-\u2014rm_default_cpu_cores</codeph>.</ph> For details about these options, see
+            <xref href="impala_resource_management.xml#rm_options"/>. See
+            <xref href="impala_resource_management.xml#resource_management"/> for information about resource
+            management in general.
+          </li>
+-->
+
+          <li>
+            During troubleshooting, Cloudera Support might direct you to change other values,
+            particularly for <codeph>IMPALA_SERVER_ARGS</codeph>, to work around issues or
+            gather debugging information.
+          </li>
+        </ul>
+
+<!-- Removing this reference now that the options are de-emphasized / desupported in CDH 5.5 / Impala 2.3 and up.
+        <p conref="impala_resource_management.xml#rm_options/resource_management_impalad_options"/>
+-->
+
+        <note>
+          <p>
+            These startup options for the <cmdname>impalad</cmdname> daemon are different from
+            the command-line options for the <cmdname>impala-shell</cmdname> command. For the
+            <cmdname>impala-shell</cmdname> options, see
+            <xref href="impala_shell_options.xml#shell_options"/>.
+          </p>
+        </note>
+
+      </conbody>
+
+    </concept>
+
+    <concept audience="Cloudera" id="config_options_statestored_details">
+
+      <title>Configuration Options for statestored Daemon</title>
+
+      <conbody>
+
+        <p></p>
+
+      </conbody>
+
+    </concept>
+
+    <concept audience="Cloudera" id="config_options_catalogd_details">
+
+      <title>Configuration Options for catalogd Daemon</title>
+
+      <conbody>
+
+        <p></p>
+
+      </conbody>
+
+    </concept>
+
+  </concept>
+
+  <concept id="config_options_checking">
+
+    <title>Checking the Values of Impala Configuration Options</title>
+
+    <conbody>
+
+      <p>
+        You can check the current runtime values of all these settings through the Impala web
+        interface, available by default at
+        <codeph>http://<varname>impala_hostname</varname>:25000/varz</codeph> for the
+        <cmdname>impalad</cmdname> daemon,
+        <codeph>http://<varname>impala_hostname</varname>:25010/varz</codeph> for the
+        <cmdname>statestored</cmdname> daemon, or
+        <codeph>http://<varname>impala_hostname</varname>:25020/varz</codeph> for the
+        <cmdname>catalogd</cmdname> daemon. In the Cloudera Manager interface, you can see the
+        link to the appropriate <uicontrol><varname>service_name</varname> Web UI</uicontrol>
+        page when you look at the status page for a specific daemon on a specific host.
+      </p>
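+
+      <p>
+        For example, you could retrieve the same information from the command line with a
+        tool such as <cmdname>curl</cmdname>, substituting your own daemon hostname:
+      </p>
+
+<codeblock>$ curl http://<varname>impala_hostname</varname>:25000/varz</codeblock>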
+
+    </conbody>
+
+  </concept>
+
+  <concept id="config_options_impalad">
+
+    <title>Startup Options for impalad Daemon</title>
+
+    <conbody>
+
+      <p>
+        The <codeph>impalad</codeph> daemon implements the main Impala service, which performs
+        query processing and reads and writes the data files.
+      </p>
+
+    </conbody>
+
+  </concept>
+
+  <concept id="config_options_statestored">
+
+    <title>Startup Options for statestored Daemon</title>
+
+    <conbody>
+
+      <p>
+        The <cmdname>statestored</cmdname> daemon implements the Impala statestore service,
+        which monitors the availability of Impala services across the cluster, and handles
+        situations such as nodes becoming unavailable or becoming available again.
+      </p>
+
+    </conbody>
+
+  </concept>
+
+  <concept rev="1.2" id="config_options_catalogd">
+
+    <title>Startup Options for catalogd Daemon</title>
+
+    <conbody>
+
+      <p>
+        The <cmdname>catalogd</cmdname> daemon implements the Impala catalog service, which
+        broadcasts metadata changes to all the Impala nodes when Impala creates a table, inserts
+        data, or performs other kinds of DDL and DML operations.
+      </p>
+
+      <p conref="../shared/impala_common.xml#common/load_catalog_in_background"/>
+
+    </conbody>
+
+  </concept>
+
+</concept>