Posted to commits@sqoop.apache.org by gw...@apache.org on 2015/04/09 06:55:50 UTC

[1/4] sqoop git commit: SQOOP-2283: Support usage of --exec and --password-alias

Repository: sqoop
Updated Branches:
  refs/heads/branch-1.4.6 d2f6c84d3 -> 739733853


SQOOP-2283: Support usage of --exec and --password-alias

(Abraham Elmahrek via Gwen Shapira)

(cherry picked from commit d32137f15c06827dea50b293f4d422481cb374cd)


Project: http://git-wip-us.apache.org/repos/asf/sqoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/sqoop/commit/621be4a6
Tree: http://git-wip-us.apache.org/repos/asf/sqoop/tree/621be4a6
Diff: http://git-wip-us.apache.org/repos/asf/sqoop/diff/621be4a6

Branch: refs/heads/branch-1.4.6
Commit: 621be4a6b7f49020f67ac738249bd04779ad00d1
Parents: d2f6c84
Author: Gwen Shapira <cs...@gmail.com>
Authored: Tue Apr 7 19:15:59 2015 -0700
Committer: Gwen Shapira <cs...@gmail.com>
Committed: Wed Apr 8 21:53:23 2015 -0700

----------------------------------------------------------------------
 src/java/org/apache/sqoop/SqoopOptions.java | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/sqoop/blob/621be4a6/src/java/org/apache/sqoop/SqoopOptions.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/sqoop/SqoopOptions.java b/src/java/org/apache/sqoop/SqoopOptions.java
index fc3d847..e4eb2f2 100644
--- a/src/java/org/apache/sqoop/SqoopOptions.java
+++ b/src/java/org/apache/sqoop/SqoopOptions.java
@@ -33,6 +33,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.sqoop.accumulo.AccumuloConstants;
 import org.apache.sqoop.util.CredentialsUtil;
 import org.apache.sqoop.util.LoggingUtils;
+import org.apache.sqoop.util.password.CredentialProviderHelper;
 import org.apache.sqoop.validation.AbortOnFailureHandler;
 import org.apache.sqoop.validation.AbsoluteValidationThreshold;
 import org.apache.sqoop.validation.RowCountValidator;
@@ -677,6 +678,16 @@ public class SqoopOptions implements Cloneable {
       }
     }
 
+    passwordAlias = props.getProperty("db.password.alias");
+    if (passwordAlias != null) {
+      try {
+        setPassword(CredentialProviderHelper.resolveAlias(getConf(), passwordAlias));
+        return; // short-circuit
+      } catch (IOException e) {
+        throw new RuntimeException("Unable to resolve credentials.", e);
+      }
+    }
+
     if (getBooleanProperty(props, "db.require.password", false)) {
       // The user's password was stripped out from the metastore.
       // Require that the user enter it now.
@@ -753,6 +764,11 @@ public class SqoopOptions implements Cloneable {
       return;
     }
 
+    if (getPasswordAlias() != null) { // short-circuit
+      putProperty(props, "db.password.alias", getPasswordAlias());
+      return;
+    }
+
     if (this.getConf().getBoolean(
       METASTORE_PASSWORD_KEY, METASTORE_PASSWORD_DEFAULT)) {
       // If the user specifies, we may store the password in the metastore.

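For context, a sketch of the intended end-to-end flow once this patch is applied, assuming
hadoop.security.credential.provider.path already points at the keystore (the alias, table,
and connection details below are illustrative, not taken from the patch):

  $ hadoop credential create sales.db.alias \
        -provider jceks://hdfs/user/sqoop/passwords.jceks
  $ sqoop job --create nightly-orders -- import \
        --connect jdbc:mysql://db.example.com/sales \
        --username sqoop --password-alias sales.db.alias \
        --table orders
  $ sqoop job --exec nightly-orders

With the change above, the saved job's metastore entry records db.password.alias instead of
the password itself, and the alias is resolved through CredentialProviderHelper when the job
is executed.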

[4/4] sqoop git commit: SQOOP-2264: Exclude and remove SqoopUserGuide.xml from git repository

Posted by gw...@apache.org.
SQOOP-2264: Exclude and remove SqoopUserGuide.xml from git repository

(Qian Xu via Jarek Jarcec Cecho)

(cherry picked from commit c51c7daeeda1704375ceacb97fc5d5793a6b9f71)

Conflicts:
	src/docs/user/SqoopUserGuide.xml


Project: http://git-wip-us.apache.org/repos/asf/sqoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/sqoop/commit/73973385
Tree: http://git-wip-us.apache.org/repos/asf/sqoop/tree/73973385
Diff: http://git-wip-us.apache.org/repos/asf/sqoop/diff/73973385

Branch: refs/heads/branch-1.4.6
Commit: 739733853e2daeef9e76aa1240b8b552bc881261
Parents: 5270908
Author: Jarek Jarcec Cecho <ja...@apache.org>
Authored: Wed Apr 8 20:14:19 2015 -0700
Committer: Gwen Shapira <cs...@gmail.com>
Committed: Wed Apr 8 21:54:53 2015 -0700

----------------------------------------------------------------------
 src/docs/.gitignore              |    2 +
 src/docs/Makefile                |    1 +
 src/docs/user/SqoopUserGuide.xml | 1151 ---------------------------------
 3 files changed, 3 insertions(+), 1151 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/sqoop/blob/73973385/src/docs/.gitignore
----------------------------------------------------------------------
diff --git a/src/docs/.gitignore b/src/docs/.gitignore
index 1ac4643..519ebf4 100644
--- a/src/docs/.gitignore
+++ b/src/docs/.gitignore
@@ -17,3 +17,5 @@
 /Sqoop-manpage.xml
 /sqoop.1
 /Sqoop-web.html
+/dev/SqoopDevGuide.xml
+/user/SqoopUserGuide.xml

http://git-wip-us.apache.org/repos/asf/sqoop/blob/73973385/src/docs/Makefile
----------------------------------------------------------------------
diff --git a/src/docs/Makefile b/src/docs/Makefile
index 5bdf2f8..224c023 100644
--- a/src/docs/Makefile
+++ b/src/docs/Makefile
@@ -105,6 +105,7 @@ supportfiles:
 
 clean:
 	-rm $(BUILD_DIR)/sqoop-*.1.gz
+	-rm $(BUILD_DIR)/SqoopDevGuide.html
 	-rm $(BUILD_DIR)/SqoopUserGuide.html
 	-rm user/SqoopUserGuide.html
 	-rm dev/SqoopDevGuide.html

http://git-wip-us.apache.org/repos/asf/sqoop/blob/73973385/src/docs/user/SqoopUserGuide.xml
----------------------------------------------------------------------
diff --git a/src/docs/user/SqoopUserGuide.xml b/src/docs/user/SqoopUserGuide.xml
deleted file mode 100644
index 6178426..0000000
--- a/src/docs/user/SqoopUserGuide.xml
+++ /dev/null
@@ -1,1151 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
-<?asciidoc-toc?>
-<?asciidoc-numbered?>
-
-<article lang="en">
-<articleinfo>
-    <title>Sqoop User Guide (v1.4.6)</title>
-</articleinfo>
-<screen>  Licensed to the Apache Software Foundation (ASF) under one
-  or more contributor license agreements.  See the NOTICE file
-  distributed with this work for additional information
-  regarding copyright ownership.  The ASF licenses this file
-  to you under the Apache License, Version 2.0 (the
-  "License"); you may not use this file except in compliance
-  with the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.</screen>
-<section id="_introduction">
-<title>Introduction</title>
-<simpara>Sqoop is a tool designed to transfer data between Hadoop and
-relational databases or mainframes. You can use Sqoop to import data from a
-relational database management system (RDBMS) such as MySQL or Oracle or a
-mainframe into the Hadoop Distributed File System (HDFS),
-transform the data in Hadoop MapReduce, and then export the data back
-into an RDBMS.</simpara>
-<simpara>Sqoop automates most of this process, relying on the database to
-describe the schema for the data to be imported. Sqoop uses MapReduce
-to import and export the data, which provides parallel operation as
-well as fault tolerance.</simpara>
-<simpara>This document describes how to get started using Sqoop to move data
-between databases and Hadoop or from a mainframe to Hadoop, and provides reference
-information for the operation of the Sqoop command-line tool suite. This
-document is intended for:</simpara>
-<itemizedlist>
-<listitem>
-<simpara>
-System and application programmers
-</simpara>
-</listitem>
-<listitem>
-<simpara>
-System administrators
-</simpara>
-</listitem>
-<listitem>
-<simpara>
-Database administrators
-</simpara>
-</listitem>
-<listitem>
-<simpara>
-Data analysts
-</simpara>
-</listitem>
-<listitem>
-<simpara>
-Data engineers
-</simpara>
-</listitem>
-</itemizedlist>
-</section>
-<section id="_supported_releases">
-<title>Supported Releases</title>
-<simpara>This documentation applies to Sqoop v1.4.6.</simpara>
-</section>
-<section id="_sqoop_releases">
-<title>Sqoop Releases</title>
-<simpara>Sqoop is an open source software product of the Apache Software Foundation.</simpara>
-<simpara>Software development for Sqoop occurs at <ulink url="http://sqoop.apache.org">http://sqoop.apache.org</ulink>.
-At that site you can obtain:</simpara>
-<itemizedlist>
-<listitem>
-<simpara>
-New releases of Sqoop as well as its most recent source code
-</simpara>
-</listitem>
-<listitem>
-<simpara>
-An issue tracker
-</simpara>
-</listitem>
-<listitem>
-<simpara>
-A wiki that contains Sqoop documentation
-</simpara>
-</listitem>
-</itemizedlist>
-</section>
-<section id="_prerequisites">
-<title>Prerequisites</title>
-<simpara>The following prerequisite knowledge is required for this product:</simpara>
-<itemizedlist>
-<listitem>
-<simpara>
-Basic computer technology and terminology
-</simpara>
-</listitem>
-<listitem>
-<simpara>
-Familiarity with command-line interfaces such as <literal>bash</literal>
-</simpara>
-</listitem>
-<listitem>
-<simpara>
-Relational database management systems
-</simpara>
-</listitem>
-<listitem>
-<simpara>
-Basic familiarity with the purpose and operation of Hadoop
-</simpara>
-</listitem>
-</itemizedlist>
-<simpara>Before you can use Sqoop, a release of Hadoop must be installed and
-configured. Sqoop currently supports four major Hadoop release lines: 0.20,
-0.23, 1.0, and 2.0.</simpara>
-<simpara>This document assumes you are using a Linux or Linux-like environment.
-If you are using Windows, you may be able to use cygwin to accomplish
-most of the following tasks. If you are using Mac OS X, you should see
-few (if any) compatibility errors. Sqoop is predominantly operated and
-tested on Linux.</simpara>
-</section>
-<section id="_basic_usage">
-<title>Basic Usage</title>
-<simpara>With Sqoop, you can <emphasis>import</emphasis> data from a relational database system or a
-mainframe into HDFS. The input to the import process is either a database table
-or a set of mainframe datasets. For databases, Sqoop will read the table row-by-row
-into HDFS.  For mainframe datasets, Sqoop will read records from each mainframe
-dataset into HDFS. The output of this import process is a set of files
-containing a copy of the imported table or datasets.
-The import process is performed in parallel. For this reason, the
-output will be in multiple files. These files may be delimited text
-files (for example, with commas or tabs separating each field), or
-binary Avro or SequenceFiles containing serialized record data.</simpara>
-<simpara>A by-product of the import process is a generated Java class which
-can encapsulate one row of the imported table. This class is used
-during the import process by Sqoop itself. The Java source code for
-this class is also provided to you, for use in subsequent MapReduce
-processing of the data. This class can serialize and deserialize data
-to and from the SequenceFile format. It can also parse the
-delimited-text form of a record. These abilities allow you to quickly
-develop MapReduce applications that use the HDFS-stored records in
-your processing pipeline. You are also free to parse the delimited
-record data yourself, using any other tools you prefer.</simpara>
-<simpara>After manipulating the imported records (for example, with MapReduce
-or Hive) you may have a result data set which you can then <emphasis>export</emphasis>
-back to the relational database. Sqoop&#8217;s export process will read
-a set of delimited text files from HDFS in parallel, parse them into
-records, and insert them as new rows in a target database table, for
-consumption by external applications or users.</simpara>
-<simpara>Sqoop includes some other commands which allow you to inspect the
-database you are working with. For example, you can list the available
-database schemas (with the <literal>sqoop-list-databases</literal> tool) and tables
-within a schema (with the <literal>sqoop-list-tables</literal> tool). Sqoop also
-includes a primitive SQL execution shell (the <literal>sqoop-eval</literal> tool).</simpara>
-<simpara>Most aspects of the import, code generation, and export processes can
-be customized. For databases, you can control the specific row range or
-columns imported.  You can specify particular delimiters and escape characters
-for the file-based representation of the data, as well as the file format
-used.  You can also control the class or package names used in
-generated code. Subsequent sections of this document explain how to
-specify these and other arguments to Sqoop.</simpara>
-</section>
-<section id="_sqoop_tools">
-<title>Sqoop Tools</title>
-<simpara>Sqoop is a collection of related tools. To use Sqoop, you specify the
-tool you want to use and the arguments that control the tool.</simpara>
-<simpara>If Sqoop is compiled from its own source, you can run Sqoop without a formal
-installation process by running the <literal>bin/sqoop</literal> program. Users
-of a packaged deployment of Sqoop (such as an RPM shipped with Apache Bigtop)
-will see this program installed as <literal>/usr/bin/sqoop</literal>. The remainder of this
-documentation will refer to this program as <literal>sqoop</literal>. For example:</simpara>
-<screen>$ sqoop tool-name [tool-arguments]</screen>
-<note><simpara>Examples that begin with a <literal>$</literal> character indicate
-commands that must be entered at a terminal prompt (such as
-<literal>bash</literal>). The <literal>$</literal> character represents the prompt itself; you should
-not start these commands by typing a <literal>$</literal>. You can also enter commands
-inline in the text of a paragraph; for example, <literal>sqoop help</literal>. These
-examples do not show a <literal>$</literal> prefix, but you should enter them the same
-way.  Don&#8217;t confuse the <literal>$</literal> shell prompt in the examples with the <literal>$</literal>
-that precedes an environment variable name. For example, the string
-literal <literal>$HADOOP_HOME</literal> includes a "<literal>$</literal>".</simpara></note>
-<simpara>Sqoop ships with a help tool. To display a list of all available
-tools, type the following command:</simpara>
-<screen>$ sqoop help
-usage: sqoop COMMAND [ARGS]
-
-Available commands:
-  codegen            Generate code to interact with database records
-  create-hive-table  Import a table definition into Hive
-  eval               Evaluate a SQL statement and display the results
-  export             Export an HDFS directory to a database table
-  help               List available commands
-  import             Import a table from a database to HDFS
-  import-all-tables  Import tables from a database to HDFS
-  import-mainframe   Import mainframe datasets to HDFS
-  list-databases     List available databases on a server
-  list-tables        List available tables in a database
-  version            Display version information
-
-See 'sqoop help COMMAND' for information on a specific command.</screen>
-<simpara>You can display help for a specific tool by entering: <literal>sqoop help
-(tool-name)</literal>; for example, <literal>sqoop help import</literal>.</simpara>
-<simpara>You can also add the <literal>--help</literal> argument to any command: <literal>sqoop import
---help</literal>.</simpara>
-<section id="_using_command_aliases">
-<title>Using Command Aliases</title>
-<simpara>In addition to typing the <literal>sqoop (toolname)</literal> syntax, you can use alias
-scripts that specify the <literal>sqoop-(toolname)</literal> syntax. For example, the
-scripts <literal>sqoop-import</literal>, <literal>sqoop-export</literal>, etc. each select a specific
-tool.</simpara>
-</section>
-<section id="_controlling_the_hadoop_installation">
-<title>Controlling the Hadoop Installation</title>
-<simpara>You invoke Sqoop through the program launch capability provided by
-Hadoop. The <literal>sqoop</literal> command-line program is a wrapper which runs the
-<literal>bin/hadoop</literal> script shipped with Hadoop. If you have multiple
-installations of Hadoop present on your machine, you can select the
-Hadoop installation by setting the <literal>$HADOOP_COMMON_HOME</literal> and
-<literal>$HADOOP_MAPRED_HOME</literal> environment variables.</simpara>
-<simpara>For example:</simpara>
-<screen>$ HADOOP_COMMON_HOME=/path/to/some/hadoop \
-  HADOOP_MAPRED_HOME=/path/to/some/hadoop-mapreduce \
-  sqoop import --arguments...</screen>
-<simpara>or:</simpara>
-<screen>$ export HADOOP_COMMON_HOME=/some/path/to/hadoop
-$ export HADOOP_MAPRED_HOME=/some/path/to/hadoop-mapreduce
-$ sqoop import --arguments...</screen>
-<simpara>If either of these variables is not set, Sqoop will fall back to
-<literal>$HADOOP_HOME</literal>. If it is not set either, Sqoop will use the default
-installation locations for Apache Bigtop, <literal>/usr/lib/hadoop</literal> and
-<literal>/usr/lib/hadoop-mapreduce</literal>, respectively.</simpara>
-<simpara>The active Hadoop configuration is loaded from <literal>$HADOOP_HOME/conf/</literal>,
-unless the <literal>$HADOOP_CONF_DIR</literal> environment variable is set.</simpara>
-</section>
-<section id="_using_generic_and_specific_arguments">
-<title>Using Generic and Specific Arguments</title>
-<simpara>To control the operation of each Sqoop tool, you use generic and
-specific arguments.</simpara>
-<simpara>For example:</simpara>
-<screen>$ sqoop help import
-usage: sqoop import [GENERIC-ARGS] [TOOL-ARGS]
-
-Common arguments:
-   --connect &lt;jdbc-uri&gt;     Specify JDBC connect string
-   --connect-manager &lt;class-name&gt;     Specify connection manager class to use
-   --driver &lt;class-name&gt;    Manually specify JDBC driver class to use
-   --hadoop-mapred-home &lt;dir&gt;      Override $HADOOP_MAPRED_HOME
-   --help                   Print usage instructions
-   --password-file          Set path for file containing authentication password
-   -P                       Read password from console
-   --password &lt;password&gt;    Set authentication password
-   --username &lt;username&gt;    Set authentication username
-   --verbose                Print more information while working
-   --hadoop-home &lt;dir&gt;     Deprecated. Override $HADOOP_HOME
-
-[...]
-
-Generic Hadoop command-line arguments:
-(must preceed any tool-specific arguments)
-Generic options supported are
--conf &lt;configuration file&gt;     specify an application configuration file
--D &lt;property=value&gt;            use value for given property
--fs &lt;local|namenode:port&gt;      specify a namenode
--jt &lt;local|jobtracker:port&gt;    specify a job tracker
--files &lt;comma separated list of files&gt;    specify comma separated files to be copied to the map reduce cluster
--libjars &lt;comma separated list of jars&gt;    specify comma separated jar files to include in the classpath.
--archives &lt;comma separated list of archives&gt;    specify comma separated archives to be unarchived on the compute machines.
-
-The general command line syntax is
-bin/hadoop command [genericOptions] [commandOptions]</screen>
-<simpara>You must supply the generic arguments <literal>-conf</literal>, <literal>-D</literal>, and so on after the
-tool name but <emphasis role="strong">before</emphasis> any tool-specific arguments (such as
-<literal>--connect</literal>). Note that generic Hadoop arguments are preceded by a
-single dash character (<literal>-</literal>), whereas tool-specific arguments start
-with two dashes (<literal>--</literal>), unless they are single character arguments such as <literal>-P</literal>.</simpara>
-<simpara>The <literal>-conf</literal>, <literal>-D</literal>, <literal>-fs</literal> and <literal>-jt</literal> arguments control the configuration
-and Hadoop server settings. For example, <literal>-D mapred.job.name=&lt;job_name&gt;</literal> can
-be used to set the name of the MR job that Sqoop launches; if not specified,
-the name defaults to the jar name for the job, which is derived from the
-table name used.</simpara>
-<simpara>The <literal>-files</literal>, <literal>-libjars</literal>, and <literal>-archives</literal> arguments are not typically used with
-Sqoop, but they are included as part of Hadoop&#8217;s internal argument-parsing
-system.</simpara>
-</section>
-<section id="_using_options_files_to_pass_arguments">
-<title>Using Options Files to Pass Arguments</title>
-<simpara>When using Sqoop, the command line options that do not change from
-invocation to invocation can be put in an options file for convenience.
-An options file is a text file where each line identifies an option in
-the order that it appears otherwise on the command line. Option files
-allow specifying a single option on multiple lines by using the
-back-slash character at the end of intermediate lines. Also supported
-are comments within option files that begin with the hash character.
-Comments must be specified on a new line and may not be mixed with
-option text. All comments and empty lines are ignored when option
-files are expanded. Unless options appear as quoted strings, any
-leading or trailing spaces are ignored. Quoted strings if used must
-not extend beyond the line on which they are specified.</simpara>
-<simpara>Option files can be specified anywhere in the command line as long as
-the options within them follow the otherwise prescribed rules of
-options ordering. For instance, regardless of where the options are
-loaded from, they must follow the ordering such that generic options
-appear first, tool specific options next, finally followed by options
-that are intended to be passed to child programs.</simpara>
-<simpara>To specify an options file, simply create an options file in a
-convenient location and pass it to the command line via
-the <literal>--options-file</literal> argument.</simpara>
-<simpara>Whenever an options file is specified, it is expanded on the
-command line before the tool is invoked. You can specify more than
-one options file within the same invocation if needed.</simpara>
-<simpara>For example, the following two Sqoop invocations for import are
-equivalent:</simpara>
-<screen>$ sqoop import --connect jdbc:mysql://localhost/db --username foo --table TEST
-
-$ sqoop --options-file /users/homer/work/import.txt --table TEST</screen>
-<simpara>where the options file <literal>/users/homer/work/import.txt</literal> contains the following:</simpara>
-<screen>import
---connect
-jdbc:mysql://localhost/db
---username
-foo</screen>
-<simpara>The options file can have empty lines and comments for readability purposes.
-So the above example would work exactly the same if the options file
-<literal>/users/homer/work/import.txt</literal> contained the following:</simpara>
-<screen>#
-# Options file for Sqoop import
-#
-
-# Specifies the tool being invoked
-import
-
-# Connect parameter and value
---connect
-jdbc:mysql://localhost/db
-
-# Username parameter and value
---username
-foo
-
-#
-# Remaining options should be specified in the command line.
-#</screen>
-</section>
-<section id="_using_tools">
-<title>Using Tools</title>
-<simpara>The following sections will describe each tool&#8217;s operation. The
-tools are listed in the most likely order you will find them useful.</simpara>
-</section>
-</section>
-<section id="_literal_sqoop_import_literal">
-<title><literal>sqoop-import</literal></title>
-<section id="_purpose">
-<title>Purpose</title>
-<simpara>The <literal>import</literal> tool imports an individual table from an RDBMS to HDFS.
-Each row from a table is represented as a separate record in HDFS.
-Records can be stored as text files (one record per line), or in
-binary representation as Avro or SequenceFiles.</simpara>
-</section>
-<section id="_syntax">
-<title>Syntax</title>
-<screen>$ sqoop import (generic-args) (import-args)
-$ sqoop-import (generic-args) (import-args)</screen>
-<simpara>While the Hadoop generic arguments must precede any import arguments,
-you can type the import arguments in any order with respect to one
-another.</simpara>
-<note><simpara>In this document, arguments are grouped into collections
-organized by function. Some collections are present in several tools
-(for example, the "common" arguments). An extended description of their
-functionality is given only on the first presentation in this
-document.</simpara></note>
-<table pgwide="0"
-frame="topbot"
-rowsep="1" colsep="1"
->
-<title>Common arguments</title>
-<tgroup cols="2">
-<colspec colwidth="248*" align="left"/>
-<colspec colwidth="230*" align="left"/>
-<thead>
-  <row>
-    <entry>
-    Argument
-    </entry>
-    <entry>
-    Description
-    </entry>
-  </row>
-</thead>
-<tbody>
-  <row>
-    <entry>
-    <literal>--connect &lt;jdbc-uri&gt;</literal>
-    </entry>
-    <entry>
-    Specify JDBC connect string
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--connection-manager &lt;class-name&gt;</literal>
-    </entry>
-    <entry>
-    Specify connection manager class to                                          use
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--driver &lt;class-name&gt;</literal>
-    </entry>
-    <entry>
-    Manually specify JDBC driver class                                          to use
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--hadoop-mapred-home &lt;dir&gt;</literal>
-    </entry>
-    <entry>
-    Override $HADOOP_MAPRED_HOME
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--help</literal>
-    </entry>
-    <entry>
-    Print usage instructions
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--password-file</literal>
-    </entry>
-    <entry>
-    Set path for a file containing the                                          authentication password
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>-P</literal>
-    </entry>
-    <entry>
-    Read password from console
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--password &lt;password&gt;</literal>
-    </entry>
-    <entry>
-    Set authentication password
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--username &lt;username&gt;</literal>
-    </entry>
-    <entry>
-    Set authentication username
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--verbose</literal>
-    </entry>
-    <entry>
-    Print more information while working
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--connection-param-file &lt;filename&gt;</literal>
-    </entry>
-    <entry>
-    Optional properties file that                                          provides connection parameters
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--relaxed-isolation</literal>
-    </entry>
-    <entry>
-    Set connection transaction isolation to read uncommitted for the mappers.
-    </entry>
-  </row>
-</tbody>
-</tgroup>
-</table>
-<section id="_connecting_to_a_database_server">
-<title>Connecting to a Database Server</title>
-<simpara>Sqoop is designed to import tables from a database into HDFS. To do
-so, you must specify a <emphasis>connect string</emphasis> that describes how to connect to the
-database. The <emphasis>connect string</emphasis> is similar to a URL, and is communicated to
-Sqoop with the <literal>--connect</literal> argument. This describes the server and
-database to connect to; it may also specify the port. For example:</simpara>
-<screen>$ sqoop import --connect jdbc:mysql://database.example.com/employees</screen>
-<simpara>This string will connect to a MySQL database named <literal>employees</literal> on the
-host <literal>database.example.com</literal>. It&#8217;s important that you <emphasis role="strong">do not</emphasis> use the URL
-<literal>localhost</literal> if you intend to use Sqoop with a distributed Hadoop
-cluster. The connect string you supply will be used on TaskTracker nodes
-throughout your MapReduce cluster; if you specify the
-literal name <literal>localhost</literal>, each node will connect to a different
-database (or more likely, no database at all). Instead, you should use
-the full hostname or IP address of the database host that can be seen
-by all your remote nodes.</simpara>
-<simpara>You might need to authenticate against the database before you can
-access it. You can use the <literal>--username</literal> argument to supply a username to the database.
-Sqoop provides a couple of different ways, secure and non-secure,
-to supply a password to the database, as detailed below.</simpara>
-<formalpara><title>Secure way of supplying a password to the database</title><para>You should save the password in a file in the user's home directory with 400
-permissions and specify the path to that file using the <emphasis role="strong"><literal>--password-file</literal></emphasis>
-argument; this is the preferred method of entering credentials. Sqoop will
-then read the password from the file and pass it to the MapReduce cluster
-using secure means without exposing the password in the job configuration.
-The file containing the password can either be on the Local FS or HDFS.
-For example:</para></formalpara>
-<screen>$ sqoop import --connect jdbc:mysql://database.example.com/employees \
-    --username venkatesh --password-file ${user.home}/.password</screen>
-<warning><simpara>Sqoop will read the entire content of the password file and use it as
-the password. This will include any trailing whitespace characters, such as
-newline characters, that are added by default by most text editors.
-You need to make sure that your password file contains only the characters
-that belong to your password. On the command line you can use the
-<literal>echo</literal> command with the <literal>-n</literal> switch to store a password without any trailing whitespace
-characters. For example, to store the password <literal>secret</literal> you would call
-<literal>echo -n "secret" &gt; password.file</literal>.</simpara></warning>
-<simpara>Another way of supplying passwords is using the <literal>-P</literal> argument which will
-read a password from a console prompt.</simpara>
-<formalpara><title>Protecting passwords from prying eyes</title><para>Hadoop 2.6.0 provides an API to separate password storage from applications.
-This API is called the credential provider API and there is a new
-<literal>credential</literal> command line tool to manage passwords and their aliases.
-The passwords are stored with their aliases in a keystore that is password
-protected. The keystore password can be provided to a password prompt
-on the command line, supplied via an environment variable, or defaulted to a
-software-defined constant. Please check the Hadoop documentation on the usage
-of this facility.</para></formalpara>
-<simpara>Once the password is stored using the Credential Provider facility and
-the Hadoop configuration has been suitably updated, all applications can
-optionally use the alias in place of the actual password and at runtime
-resolve the alias for the password to use.</simpara>
-<simpara>Since the keystore or similar technology used by the credential
-provider is shared across components, passwords for various applications,
-various databases, and other services can be securely stored in it, and only
-the aliases need to be exposed in configuration files, protecting the passwords
-from being visible.</simpara>
-<simpara>Sqoop has been enhanced to allow usage of this functionality if it is
-available in the underlying Hadoop version being used. One new option
-has been introduced to provide the alias on the command line instead of the
-actual password (<literal>--password-alias</literal>). The argument value for this option is
-the alias in the credential store associated with the actual password.
-Example usage is as follows:</simpara>
-<screen>$ sqoop import --connect jdbc:mysql://database.example.com/employees \
-    --username dbuser --password-alias mydb.password.alias</screen>
-<simpara>Similarly, if the command line option is not preferred, the alias can be saved
-in the file provided with the <literal>--password-file</literal> option. Along with this, the
-Sqoop configuration parameter org.apache.sqoop.credentials.loader.class
-should be set to the classname that provides the alias resolution:
-<literal>org.apache.sqoop.util.password.CredentialProviderPasswordLoader</literal></simpara>
-<simpara>Example usage is as follows (assuming .password.alias has the alias for
-the real password):</simpara>
-<screen>$ sqoop import --connect jdbc:mysql://database.example.com/employees \
-    --username dbuser --password-file ${user.home}/.password-alias</screen>
-<warning><simpara>The <literal>--password</literal> parameter is insecure, as other users may
-be able to read your password from the command-line arguments via
-the output of programs such as <literal>ps</literal>. The <emphasis role="strong"><literal>-P</literal></emphasis> argument is the preferred
-method over using the <literal>--password</literal> argument. Credentials may still be
-transferred between nodes of the MapReduce cluster using insecure means.
-For example:</simpara></warning>
-<screen>$ sqoop import --connect jdbc:mysql://database.example.com/employees \
-    --username aaron --password 12345</screen>
-<simpara>Sqoop automatically supports several databases, including MySQL.  Connect
-strings beginning with <literal>jdbc:mysql://</literal> are handled automatically in Sqoop.  (A
-full list of databases with built-in support is provided in the "Supported
-Databases" section. For some, you may need to install the JDBC driver
-yourself.)</simpara>
-<simpara>You can use Sqoop with any other
-JDBC-compliant database. First, download the appropriate JDBC
-driver for the type of database you want to import, and install the .jar
-file in the <literal>$SQOOP_HOME/lib</literal> directory on your client machine. (This will
-be <literal>/usr/lib/sqoop/lib</literal> if you installed from an RPM or Debian package.)
-Each driver <literal>.jar</literal> file also has a specific driver class which defines
-the entry-point to the driver. For example, MySQL&#8217;s Connector/J library has
-a driver class of <literal>com.mysql.jdbc.Driver</literal>. Refer to your database
-vendor-specific documentation to determine the main driver class.
-This class must be provided as an argument to Sqoop with <literal>--driver</literal>.</simpara>
-<simpara>For example, to connect to a SQLServer database, first download the driver from
-microsoft.com and install it in your Sqoop lib path.</simpara>
-<simpara>Then run Sqoop. For example:</simpara>
-<screen>$ sqoop import --driver com.microsoft.jdbc.sqlserver.SQLServerDriver \
-    --connect &lt;connect-string&gt; ...</screen>
-<simpara>When connecting to a database using JDBC, you can optionally specify extra
-JDBC parameters via a property file using the option
-<literal>--connection-param-file</literal>. The contents of this file are parsed as standard
-Java properties and passed into the driver while creating a connection.</simpara>
-<note><simpara>The parameters specified via the optional property file are only
-applicable to JDBC connections. Any fastpath connectors that use connections
-other than JDBC will ignore these parameters.</simpara></note>
-<table pgwide="0"
-frame="topbot"
-rowsep="1" colsep="1"
->
-<title>Validation arguments <link linkend="validation">More Details</link></title>
-<tgroup cols="2">
-<colspec colwidth="267*" align="left"/>
-<colspec colwidth="230*" align="left"/>
-<thead>
-  <row>
-    <entry>
-    Argument
-    </entry>
-    <entry>
-    Description
-    </entry>
-  </row>
-</thead>
-<tbody>
-  <row>
-    <entry>
-    <literal>--validate</literal>
-    </entry>
-    <entry>
-    Enable validation of data copied, supports single table copy only.
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--validator &lt;class-name&gt;</literal>
-    </entry>
-    <entry>
-    Specify validator class to use.
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--validation-threshold &lt;class-name&gt;</literal>
-    </entry>
-    <entry>
-    Specify validation threshold class                                             to use.
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--validation-failurehandler &lt;class-name&gt;</literal>
-    </entry>
-    <entry>
-    Specify validation failure                                             handler class to use.
-    </entry>
-  </row>
-</tbody>
-</tgroup>
-</table>
-<table pgwide="0"
-frame="topbot"
-rowsep="1" colsep="1"
->
-<title>Import control arguments:</title>
-<tgroup cols="2">
-<colspec colwidth="206*" align="left"/>
-<colspec colwidth="236*" align="left"/>
-<thead>
-  <row>
-    <entry>
-    Argument
-    </entry>
-    <entry>
-    Description
-    </entry>
-  </row>
-</thead>
-<tbody>
-  <row>
-    <entry>
-    <literal>--append</literal>
-    </entry>
-    <entry>
-    Append data to an existing dataset                                  in HDFS
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--as-avrodatafile</literal>
-    </entry>
-    <entry>
-    Imports data to Avro Data Files
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--as-sequencefile</literal>
-    </entry>
-    <entry>
-    Imports data to SequenceFiles
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--as-textfile</literal>
-    </entry>
-    <entry>
-    Imports data as plain text (default)
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--as-parquetfile</literal>
-    </entry>
-    <entry>
-    Imports data to Parquet Files
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--boundary-query &lt;statement&gt;</literal>
-    </entry>
-    <entry>
-    Boundary query to use for creating splits
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--columns &lt;col,col,col&#8230;&gt;</literal>
-    </entry>
-    <entry>
-    Columns to import from table
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--delete-target-dir</literal>
-    </entry>
-    <entry>
-    Delete the import target directory                                  if it exists
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--direct</literal>
-    </entry>
-    <entry>
-    Use direct connector if exists for the database
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--fetch-size &lt;n&gt;</literal>
-    </entry>
-    <entry>
-    Number of entries to read from database                                  at once.
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--inline-lob-limit &lt;n&gt;</literal>
-    </entry>
-    <entry>
-    Set the maximum size for an inline LOB
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>-m,--num-mappers &lt;n&gt;</literal>
-    </entry>
-    <entry>
-    Use <emphasis>n</emphasis> map tasks to import in parallel
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>-e,--query &lt;statement&gt;</literal>
-    </entry>
-    <entry>
-    Import the results of <emphasis><literal>statement</literal></emphasis>.
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--split-by &lt;column-name&gt;</literal>
-    </entry>
-    <entry>
-    Column of the table used to split work                                  units
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--table &lt;table-name&gt;</literal>
-    </entry>
-    <entry>
-    Table to read
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--target-dir &lt;dir&gt;</literal>
-    </entry>
-    <entry>
-    HDFS destination dir
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--warehouse-dir &lt;dir&gt;</literal>
-    </entry>
-    <entry>
-    HDFS parent for table destination
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--where &lt;where clause&gt;</literal>
-    </entry>
-    <entry>
-    WHERE clause to use during import
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>-z,--compress</literal>
-    </entry>
-    <entry>
-    Enable compression
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--compression-codec &lt;c&gt;</literal>
-    </entry>
-    <entry>
-    Use Hadoop codec (default gzip)
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--null-string &lt;null-string&gt;</literal>
-    </entry>
-    <entry>
-    The string to be written for a null                                  value for string columns
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--null-non-string &lt;null-string&gt;</literal>
-    </entry>
-    <entry>
-    The string to be written for a null                                  value for non-string columns
-    </entry>
-  </row>
-</tbody>
-</tgroup>
-</table>
-<simpara>The <literal>--null-string</literal> and <literal>--null-non-string</literal> arguments are optional.
-If not specified, then the string "null" will be used.</simpara>
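-<simpara>For example, a commonly used form is to emit <literal>\N</literal> (which Hive
-interprets as NULL) for null values in both string and non-string columns; note
-the doubled backslash, which is needed so the escape sequence survives argument
-parsing (the rest of the invocation is elided):</simpara>
-<screen>$ sqoop import ... --null-string '\\N' --null-non-string '\\N'</screen>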
-</section>
-<section id="_selecting_the_data_to_import">
-<title>Selecting the Data to Import</title>
-<simpara>Sqoop typically imports data in a table-centric fashion. Use the
-<literal>--table</literal> argument to select the table to import. For example, <literal>--table
-employees</literal>. This argument can also identify a <literal>VIEW</literal> or other table-like
-entity in a database.</simpara>
-<simpara>By default, all columns within a table are selected for import.
-Imported data is written to HDFS in its "natural order;" that is, a
-table containing columns A, B, and C results in an import of data such
-as:</simpara>
-<screen>A1,B1,C1
-A2,B2,C2
-...</screen>
-<simpara>You can select a subset of columns and control their ordering by using
-the <literal>--columns</literal> argument. This should include a comma-delimited list
-of columns to import. For example: <literal>--columns "name,employee_id,jobtitle"</literal>.</simpara>
-<simpara>You can control which rows are imported by adding a SQL <literal>WHERE</literal> clause
-to the import statement. By default, Sqoop generates statements of the
-form <literal>SELECT &lt;column list&gt; FROM &lt;table name&gt;</literal>. You can append a
-<literal>WHERE</literal> clause to this with the <literal>--where</literal> argument. For example: <literal>--where
-"id &gt; 400"</literal>. Only rows where the <literal>id</literal> column has a value greater than
-400 will be imported.</simpara>
-<simpara>By default, Sqoop will use the query <literal>select min(&lt;split-by&gt;), max(&lt;split-by&gt;) from
-&lt;table name&gt;</literal> to find the boundaries for creating splits. In some cases this query
-is not the most optimal, so you can specify any arbitrary query returning two
-numeric columns using the <literal>--boundary-query</literal> argument.</simpara>
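-<simpara>For example, combining the <literal>--where</literal> clause shown above with a matching
-boundary query might look like the following (the table and column names are
-illustrative):</simpara>
-<screen>$ sqoop import --connect jdbc:mysql://database.example.com/employees \
-    --table employees --where "id &gt; 400" \
-    --boundary-query "SELECT MIN(id), MAX(id) FROM employees WHERE id &gt; 400"</screen>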
-</section>
-<section id="_free_form_query_imports">
-<title>Free-form Query Imports</title>
-<simpara>Sqoop can also import the result set of an arbitrary SQL query. Instead of
-using the <literal>--table</literal>, <literal>--columns</literal> and <literal>--where</literal> arguments, you can specify
-a SQL statement with the <literal>--query</literal> argument.</simpara>
-<simpara>When importing a free-form query, you must specify a destination directory
-with <literal>--target-dir</literal>.</simpara>
-<simpara>If you want to import the results of a query in parallel, then each map task
-will need to execute a copy of the query, with results partitioned by bounding
-conditions inferred by Sqoop. Your query must include the token <literal>$CONDITIONS</literal>
-which each Sqoop process will replace with a unique condition expression.
-You must also select a splitting column with <literal>--split-by</literal>.</simpara>
-<simpara>For example:</simpara>
-<screen>$ sqoop import \
-  --query 'SELECT a.*, b.* FROM a JOIN b on (a.id == b.id) WHERE $CONDITIONS' \
-  --split-by a.id --target-dir /user/foo/joinresults</screen>
-<simpara>Alternatively, the query can be executed once and imported serially, by
-specifying a single map task with <literal>-m 1</literal>:</simpara>
-<screen>$ sqoop import \
-  --query 'SELECT a.*, b.* FROM a JOIN b on (a.id == b.id) WHERE $CONDITIONS' \
-  -m 1 --target-dir /user/foo/joinresults</screen>
-<note><simpara>If you are issuing the query wrapped with double quotes ("),
-you will have to use <literal>\$CONDITIONS</literal> instead of just <literal>$CONDITIONS</literal>
-to prevent your shell from treating it as a shell variable.
-For example, a double quoted query may look like:
-<literal>"SELECT * FROM x WHERE a='foo' AND \$CONDITIONS"</literal></simpara></note>
-<note><simpara>The facility of using a free-form query in the current version of Sqoop
-is limited to simple queries where there are no ambiguous projections and
-no <literal>OR</literal> conditions in the <literal>WHERE</literal> clause. Use of complex queries such as
-queries that have sub-queries or joins leading to ambiguous projections can
-lead to unexpected results.</simpara></note>
-</section>
-<section id="_controlling_parallelism">
-<title>Controlling Parallelism</title>
-<simpara>Sqoop imports data in parallel from most database sources. You can
-specify the number
-of map tasks (parallel processes) to use to perform the import by
-using the <literal>-m</literal> or <literal>--num-mappers</literal> argument. Each of these arguments
-takes an integer value which corresponds to the degree of parallelism
-to employ. By default, four tasks are used. Some databases may see
-improved performance by increasing this value to 8 or 16. Do not
-increase the degree of parallelism beyond that available within
-your MapReduce cluster; tasks will run serially and will likely
-increase the amount of time required to perform the import. Likewise,
-do not increase the degree of parallelism beyond what your
-database can reasonably support. Connecting 100 concurrent clients to
-your database may increase the load on the database server to a point
-where performance suffers as a result.</simpara>
-<simpara>When performing parallel imports, Sqoop needs a criterion by which it
-can split the workload. Sqoop uses a <emphasis>splitting column</emphasis> to split the
-workload. By default, Sqoop will identify the primary key column (if
-present) in a table and use it as the splitting column. The low and
-high values for the splitting column are retrieved from the database,
-and the map tasks operate on evenly-sized components of the total
-range. For example, if you had a table with a primary key column of
-<literal>id</literal> whose minimum value was 0 and maximum value was 1000, and Sqoop
-was directed to use 4 tasks, Sqoop would run four processes which each
-execute SQL statements of the form <literal>SELECT * FROM sometable WHERE id
-&gt;= lo AND id &lt; hi</literal>, with <literal>(lo, hi)</literal> set to (0, 250), (250, 500),
-(500, 750), and (750, 1001) in the different tasks.</simpara>
-<simpara>If the actual values for the primary key are not uniformly distributed
-across its range, then this can result in unbalanced tasks. You should
-explicitly choose a different column with the <literal>--split-by</literal> argument.
-For example, <literal>--split-by employee_id</literal>. Sqoop cannot currently split on
-multi-column indices. If your table has no index column, or has a
-multi-column key, then you must also manually choose a splitting
-column.</simpara>
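-<simpara>For example, an import that splits on <literal>employee_id</literal> and uses eight map
-tasks might look like the following (the connection details and names are
-illustrative):</simpara>
-<screen>$ sqoop import --connect jdbc:mysql://database.example.com/employees \
-    --table employees --split-by employee_id -m 8</screen>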
-</section>
-<section id="_controlling_distributed_cache">
-<title>Controlling Distributed Cache</title>
-<simpara>Sqoop will copy the jars in the $SQOOP_HOME/lib folder to the job cache every
-time it starts a Sqoop job. When launched by Oozie this is unnecessary,
-since Oozie uses its own Sqoop share lib, which keeps the Sqoop dependencies
-in the distributed cache. Oozie will do the localization on each
-worker node for the Sqoop dependencies only once during the first Sqoop
-job and reuse the jars on the worker nodes for subsequent jobs. Using the
-<literal>--skip-dist-cache</literal> option in the Sqoop command when launched by Oozie will
-skip the step in which Sqoop copies its dependencies to the job cache and save
-significant I/O.</simpara>
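-<simpara>For example, an Oozie-launched invocation that skips the redundant copy might
-look like the following (the connection details are illustrative):</simpara>
-<screen>$ sqoop import --connect jdbc:mysql://database.example.com/sales \
-    --table orders --skip-dist-cache</screen>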
-</section>
-<section id="_controlling_the_import_process">
-<title>Controlling the Import Process</title>
-<simpara>By default, the import process will use JDBC which provides a
-reasonable cross-vendor import channel. Some databases can perform
-imports in a more high-performance fashion by using database-specific
-data movement tools. For example, MySQL provides the <literal>mysqldump</literal> tool
-which can export data from MySQL to other systems very quickly. By
-supplying the <literal>--direct</literal> argument, you are specifying that Sqoop
-should attempt the direct import channel. This channel may offer
-higher performance than using JDBC.</simpara>
-<simpara>Details about use of direct mode with each specific RDBMS, installation requirements, available
-options and limitations can be found in <xref linkend="connectors"/>.</simpara>
-<simpara>By default, Sqoop will import a table named <literal>foo</literal> to a directory named
-<literal>foo</literal> inside your home directory in HDFS. For example, if your
-username is <literal>someuser</literal>, then the import tool will write to
-<literal>/user/someuser/foo/(files)</literal>. You can adjust the parent directory of
-the import with the <literal>--warehouse-dir</literal> argument. For example:</simpara>
-<screen>$ sqoop import --connect &lt;connect-str&gt; --table foo --warehouse-dir /shared \
-    ...</screen>
-<simpara>This command would write to a set of files in the <literal>/shared/foo/</literal> directory.</simpara>
-<simpara>You can also explicitly choose the target directory, like so:</simpara>
-<screen>$ sqoop import --connect &lt;connect-str&gt; --table foo --target-dir /dest \
-    ...</screen>
-<simpara>This will import the files into the <literal>/dest</literal> directory. <literal>--target-dir</literal> is
-incompatible with <literal>--warehouse-dir</literal>.</simpara>
-<simpara>When using direct mode, you can specify additional arguments which
-should be passed to the underlying tool. If the argument
-<literal>--</literal> is given on the command-line, then subsequent arguments are sent
-directly to the underlying tool. For example, the following adjusts
-the character set used by <literal>mysqldump</literal>:</simpara>
-<screen>$ sqoop import --connect jdbc:mysql://server.foo.com/db --table bar \
-    --direct -- --default-character-set=latin1</screen>
-<simpara>By default, imports go to a new target location. If the destination directory
-already exists in HDFS, Sqoop will refuse to import and overwrite that
-directory&#8217;s contents. If you use the <literal>--append</literal> argument, Sqoop will import
-data to a temporary directory and then rename the files into the normal
-target directory in a manner that does not conflict with existing filenames
-in that directory.</simpara>
-</section>
-<section id="_controlling_transaction_isolation">
-<title>Controlling transaction isolation</title>
-<simpara>By default, Sqoop uses read committed transaction isolation in the mappers
-to import data. This may not be ideal in all ETL workflows, and it may be
-desirable to reduce the isolation guarantees. The <literal>--relaxed-isolation</literal> option
-can be used to instruct Sqoop to use the read uncommitted isolation level.</simpara>
-<simpara>The <literal>read-uncommitted</literal> isolation level is not supported on all databases
-(for example, Oracle), so specifying the option <literal>--relaxed-isolation</literal>
-may not be supported on all databases.</simpara>
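-<simpara>For example, an invocation requesting relaxed isolation might look like the
-following (the connection details are illustrative):</simpara>
-<screen>$ sqoop import --connect jdbc:mysql://database.example.com/sales \
-    --table orders --relaxed-isolation</screen>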
-</section>
-<section id="_controlling_type_mapping">
-<title>Controlling type mapping</title>
-<simpara>Sqoop is preconfigured to map most SQL types to appropriate Java or Hive
-representations. However, the default mapping might not be suitable for
-everyone and can be overridden with <literal>--map-column-java</literal> (for changing
-the mapping to Java) or <literal>--map-column-hive</literal> (for changing the Hive mapping).</simpara>
-<table pgwide="0"
-frame="topbot"
-rowsep="1" colsep="1"
->
-<title>Parameters for overriding mapping</title>
-<tgroup cols="2">
-<colspec colwidth="206*" align="left"/>
-<colspec colwidth="236*" align="left"/>
-<thead>
-  <row>
-    <entry>
-    Argument
-    </entry>
-    <entry>
-    Description
-    </entry>
-  </row>
-</thead>
-<tbody>
-  <row>
-    <entry>
-    <literal>--map-column-java &lt;mapping&gt;</literal>
-    </entry>
-    <entry>
-    Override mapping from SQL to Java type                                  for configured columns.
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--map-column-hive &lt;mapping&gt;</literal>
-    </entry>
-    <entry>
-    Override mapping from SQL to Hive type                                  for configured columns.
-    </entry>
-  </row>
-</tbody>
-</tgroup>
-</table>
-<simpara>Sqoop expects a comma-separated list of mappings in the form &lt;name of column&gt;=&lt;new type&gt;. For example:</simpara>
-<screen>$ sqoop import ... --map-column-java id=String,value=Integer</screen>
-<simpara>Sqoop will raise an exception if a configured mapping is not used.</simpara>
-</section>
-<section id="_incremental_imports">
-<title>Incremental Imports</title>
-<simpara>Sqoop provides an incremental import mode which can be used to retrieve
-only rows newer than some previously-imported set of rows.</simpara>
-<simpara>The following arguments control incremental imports:</simpara>
-<table pgwide="0"
-frame="topbot"
-rowsep="1" colsep="1"
->
-<title>Incremental import arguments:</title>
-<tgroup cols="2">
-<colspec colwidth="182*" align="left"/>
-<colspec colwidth="236*" align="left"/>
-<thead>
-  <row>
-    <entry>
-    Argument
-    </entry>
-    <entry>
-    Description
-    </entry>
-  </row>
-</thead>
-<tbody>
-  <row>
-    <entry>
-    <literal>--check-column (col)</literal>
-    </entry>
-    <entry>
-    Specifies the column to be examined when determining which rows to import. (The column should not be of type CHAR/NCHAR/VARCHAR/VARNCHAR/LONGVARCHAR/LONGNVARCHAR.)
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--incremental (mode)</literal>
-    </entry>
-    <entry>
-    Specifies how Sqoop determines which rows are new. Legal values for <literal>mode</literal> include <literal>append</literal> and <literal>lastmodified</literal>.
-    </entry>
-  </row>
-  <row>
-    <entry>
-    <literal>--last-value (value)</literal>
-    </entry>
-    <entry>
-    Specifies the maximum value of the check column from the previous import.
-    </entry>
-  </row>
-</tbody>
-</tgroup>
-</table>
-<simpara>Sqoop supports two types of incremental imports: <literal>append</literal> and <literal>lastmodified</literal>.
-You can use the <literal>--incremental</literal> argument to specify the type of incremental
-import to perform.</simpara>
-<simpara>You should specify <literal>append</literal> mode when importing a table where new rows are
-continually being added with increasing row id values. You specify the column
-containing the row&#8217;s id with <literal>--check-column</literal>. Sqoop imports rows where the
-check column has a value greater than the one specified with <literal>--last-value</literal>.</simpara>
-<simpara>An alternate table update strategy supported by Sqoop is called <literal>lastmodified</literal>
-mode. You should use this when rows of the source table may be updated, and
-each such update will set the value of a last-modified column to the current
-timestamp.  Rows where the check column holds a timestamp more recent than the
-timestamp specified with <literal>--last-value</literal> are imported.</simpara>
-<simpara>At the end of an incremental import, the value which should be specified as
-<literal>--last-value</literal> for a subsequent import is printed to the screen. When running
-a subsequent import, you should specify <literal>--last-value</literal> in this way to ensure
-you import only the new or updated data. This is handled automatically by
-creating an incremental import as a saved job, which is the preferred
-mechanism for performing a recurring incremental import. See the section on
-saved jobs later in this document for more information.</simpara>
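-<simpara>For example, an append-mode import that picks up rows added since a previous
-run might look like the following (the table, column, and last value are
-illustrative):</simpara>
-<screen>$ sqoop import --connect jdbc:mysql://database.example.com/sales \
-    --table orders --incremental append \
-    --check-column id --last-value 7890</screen>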
-</section>
-<section id="_file_formats">
-<title>File Formats</title>
-<simpara>You can import data in one of two file formats: delimited text or
-SequenceFiles.</simpara>
-<simpara>Delimited text is the default import format. You can also specify it
-explicitly by using the <literal>--as-textfile</literal> argument. This argument will write
-string-based representations of each record to the output files, with
-delimiter characters between individual columns and rows. These
-delimiters may be commas, tabs, or other characters. (The delimiters
-can be selected; see "Output line formatting arguments.") The
-following is the result of an example text-based import:</simpara>
-<screen>1,here is a message,2010-05-01
-2,happy new year!,2010-01-01
-3,another message,2009-11-12</screen>
-<simpara>Delimited text is appropriate for most non-binary data types. It also
-readily supports further manipulation by other tools, such as Hive.</simpara>
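
The incremental import options documented in the removed guide text above are typically combined on a single command line. As a rough sketch (the connection string, table, column, and last value below are illustrative placeholders, not taken from this patch):

  $ sqoop import --connect jdbc:mysql://db.example.com/shop --table orders \
      --incremental append --check-column id --last-value 100

When the job finishes, Sqoop prints the new maximum value of the check column; passing that value as --last-value on the next run (or letting a saved job track it) imports only the rows added since.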


[3/4] sqoop git commit: SQOOP-2252: Add default to Avro Schema

Posted by gw...@apache.org.
SQOOP-2252: Add default to Avro Schema

(Andrew Stevenson via Jarek Jarcec Cecho)

(cherry picked from commit 8050107ecf0df6d8dc3869f6c82fe84ee098f08a)


Project: http://git-wip-us.apache.org/repos/asf/sqoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/sqoop/commit/5270908b
Tree: http://git-wip-us.apache.org/repos/asf/sqoop/tree/5270908b
Diff: http://git-wip-us.apache.org/repos/asf/sqoop/diff/5270908b

Branch: refs/heads/branch-1.4.6
Commit: 5270908b3e3c0429855ac316abc7cec52fc3dbf3
Parents: b87b994
Author: Jarek Jarcec Cecho <ja...@apache.org>
Authored: Wed Apr 8 17:57:57 2015 -0700
Committer: Gwen Shapira <cs...@gmail.com>
Committed: Wed Apr 8 21:53:41 2015 -0700

----------------------------------------------------------------------
 src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java | 6 ++++--
 src/test/com/cloudera/sqoop/TestAvroExport.java        | 2 +-
 src/test/com/cloudera/sqoop/TestAvroImport.java        | 4 ++--
 src/test/com/cloudera/sqoop/TestParquetExport.java     | 2 +-
 src/test/com/cloudera/sqoop/TestParquetImport.java     | 4 ++--
 5 files changed, 10 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/sqoop/blob/5270908b/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java b/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
index a73aa13..ed8e8b1 100644
--- a/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
+++ b/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
@@ -34,6 +34,8 @@ import com.cloudera.sqoop.SqoopOptions;
 import com.cloudera.sqoop.manager.ConnManager;
 import org.apache.sqoop.avro.AvroUtil;
 
+import org.codehaus.jackson.node.NullNode;
+
 /**
  * Creates an Avro schema to represent a table from a database.
  */
@@ -64,7 +66,7 @@ public class AvroSchemaGenerator {
       String cleanedCol = AvroUtil.toAvroIdentifier(ClassWriter.toJavaIdentifier(columnName));
       int sqlType = columnTypes.get(columnName);
       Schema avroSchema = toAvroSchema(sqlType, columnName);
-      Field field = new Field(cleanedCol, avroSchema, null, null);
+      Field field = new Field(cleanedCol, avroSchema, null,  NullNode.getInstance());
       field.addProp("columnName", columnName);
       field.addProp("sqlType", Integer.toString(sqlType));
       fields.add(field);
@@ -92,8 +94,8 @@ public class AvroSchemaGenerator {
    */
   public Schema toAvroSchema(int sqlType, String columnName) {
     List<Schema> childSchemas = new ArrayList<Schema>();
-    childSchemas.add(Schema.create(toAvroType(columnName, sqlType)));
     childSchemas.add(Schema.create(Schema.Type.NULL));
+    childSchemas.add(Schema.create(toAvroType(columnName, sqlType)));
     return Schema.createUnion(childSchemas);
   }
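
The union reordering above is what makes the new default legal: the Avro specification requires a union field's default value to match the first branch of the union, so the NULL branch must come first for a null default. A minimal standalone sketch of the resulting field construction (record and field names here are illustrative, not taken from the patch):

import java.util.ArrayList;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.codehaus.jackson.node.NullNode;

public class NullableFieldSketch {
  public static void main(String[] args) {
    // Build a ["null", "int"] union; NULL comes first so a null default is valid.
    List<Schema> branches = new ArrayList<Schema>();
    branches.add(Schema.create(Schema.Type.NULL));
    branches.add(Schema.create(Schema.Type.INT));
    Schema union = Schema.createUnion(branches);

    // NullNode.getInstance() supplies the JSON null default value,
    // as the patched AvroSchemaGenerator now does.
    Field field = new Field("id", union, null /* doc */, NullNode.getInstance());

    Schema record = Schema.createRecord("sketch", null, "org.example", false);
    List<Field> fields = new ArrayList<Field>();
    fields.add(field);
    record.setFields(fields);

    // Prints a schema whose field declares "default": null.
    System.out.println(record.toString(true));
  }
}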
 

http://git-wip-us.apache.org/repos/asf/sqoop/blob/5270908b/src/test/com/cloudera/sqoop/TestAvroExport.java
----------------------------------------------------------------------
diff --git a/src/test/com/cloudera/sqoop/TestAvroExport.java b/src/test/com/cloudera/sqoop/TestAvroExport.java
index db776e6..663828c 100644
--- a/src/test/com/cloudera/sqoop/TestAvroExport.java
+++ b/src/test/com/cloudera/sqoop/TestAvroExport.java
@@ -338,8 +338,8 @@ public class TestAvroExport extends ExportJobTestCase {
     final int TOTAL_RECORDS = 1 * 10;
 
     List<Schema> childSchemas = new ArrayList<Schema>();
-    childSchemas.add(Schema.create(Schema.Type.STRING));
     childSchemas.add(Schema.create(Schema.Type.NULL));
+    childSchemas.add(Schema.create(Schema.Type.STRING));
     Schema schema =  Schema.createUnion(childSchemas);
     ColumnGenerator gen0 = colGenerator(null, schema, null, "VARCHAR(64)");
     ColumnGenerator gen1 = colGenerator("s", schema, "s", "VARCHAR(64)");

http://git-wip-us.apache.org/repos/asf/sqoop/blob/5270908b/src/test/com/cloudera/sqoop/TestAvroImport.java
----------------------------------------------------------------------
diff --git a/src/test/com/cloudera/sqoop/TestAvroImport.java b/src/test/com/cloudera/sqoop/TestAvroImport.java
index 08b8aa9..af83ce0 100644
--- a/src/test/com/cloudera/sqoop/TestAvroImport.java
+++ b/src/test/com/cloudera/sqoop/TestAvroImport.java
@@ -230,8 +230,8 @@ public class TestAvroImport extends ImportJobTestCase {
   private void checkField(Field field, String name, Type type) {
     assertEquals(name, field.name());
     assertEquals(Schema.Type.UNION, field.schema().getType());
-    assertEquals(type, field.schema().getTypes().get(0).getType());
-    assertEquals(Schema.Type.NULL, field.schema().getTypes().get(1).getType());
+    assertEquals(Schema.Type.NULL, field.schema().getTypes().get(0).getType());
+    assertEquals(type, field.schema().getTypes().get(1).getType());
   }
 
   public void testNullableAvroImport() throws IOException, SQLException {

http://git-wip-us.apache.org/repos/asf/sqoop/blob/5270908b/src/test/com/cloudera/sqoop/TestParquetExport.java
----------------------------------------------------------------------
diff --git a/src/test/com/cloudera/sqoop/TestParquetExport.java b/src/test/com/cloudera/sqoop/TestParquetExport.java
index 9065daf..86b40fb 100644
--- a/src/test/com/cloudera/sqoop/TestParquetExport.java
+++ b/src/test/com/cloudera/sqoop/TestParquetExport.java
@@ -320,8 +320,8 @@ public class TestParquetExport extends ExportJobTestCase {
     final int TOTAL_RECORDS = 1 * 10;
 
     List<Schema> childSchemas = new ArrayList<Schema>();
-    childSchemas.add(Schema.create(Schema.Type.STRING));
     childSchemas.add(Schema.create(Schema.Type.NULL));
+    childSchemas.add(Schema.create(Schema.Type.STRING));
     Schema schema =  Schema.createUnion(childSchemas);
     ColumnGenerator gen0 = colGenerator(null, schema, null, "VARCHAR(64)");
     ColumnGenerator gen1 = colGenerator("s", schema, "s", "VARCHAR(64)");

http://git-wip-us.apache.org/repos/asf/sqoop/blob/5270908b/src/test/com/cloudera/sqoop/TestParquetImport.java
----------------------------------------------------------------------
diff --git a/src/test/com/cloudera/sqoop/TestParquetImport.java b/src/test/com/cloudera/sqoop/TestParquetImport.java
index 21a2fec..ae5427c 100644
--- a/src/test/com/cloudera/sqoop/TestParquetImport.java
+++ b/src/test/com/cloudera/sqoop/TestParquetImport.java
@@ -211,8 +211,8 @@ public class TestParquetImport extends ImportJobTestCase {
   private void checkField(Field field, String name, Type type) {
     assertEquals(name, field.name());
     assertEquals(Type.UNION, field.schema().getType());
-    assertEquals(type, field.schema().getTypes().get(0).getType());
-    assertEquals(Type.NULL, field.schema().getTypes().get(1).getType());
+    assertEquals(Type.NULL, field.schema().getTypes().get(0).getType());
+    assertEquals(type, field.schema().getTypes().get(1).getType());
   }
 
 }


[2/4] sqoop git commit: SQOOP-2286: Ensure Sqoop generates valid avro column names

Posted by gw...@apache.org.
SQOOP-2286: Ensure Sqoop generates valid avro column names

(Abraham Elmahrek via Gwen Shapira)

(cherry picked from commit baf51351281842bd660572fcc05c89d6407913c5)


Project: http://git-wip-us.apache.org/repos/asf/sqoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/sqoop/commit/b87b994c
Tree: http://git-wip-us.apache.org/repos/asf/sqoop/tree/b87b994c
Diff: http://git-wip-us.apache.org/repos/asf/sqoop/diff/b87b994c

Branch: refs/heads/branch-1.4.6
Commit: b87b994c571cc509d9bca060356767b5621cf0fe
Parents: 621be4a
Author: Gwen Shapira <cs...@gmail.com>
Authored: Tue Apr 7 19:39:02 2015 -0700
Committer: Gwen Shapira <cs...@gmail.com>
Committed: Wed Apr 8 21:53:31 2015 -0700

----------------------------------------------------------------------
 src/java/org/apache/sqoop/avro/AvroUtil.java    | 23 +++++++++++++++++++-
 .../apache/sqoop/orm/AvroSchemaGenerator.java   |  3 ++-
 src/test/com/cloudera/sqoop/TestAvroImport.java | 21 ++++++++++++++++++
 3 files changed, 45 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/sqoop/blob/b87b994c/src/java/org/apache/sqoop/avro/AvroUtil.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/sqoop/avro/AvroUtil.java b/src/java/org/apache/sqoop/avro/AvroUtil.java
index 2fdf263..ee3cf62 100644
--- a/src/java/org/apache/sqoop/avro/AvroUtil.java
+++ b/src/java/org/apache/sqoop/avro/AvroUtil.java
@@ -24,6 +24,7 @@ import org.apache.avro.generic.GenericRecord;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.sqoop.lib.BlobRef;
 import org.apache.sqoop.lib.ClobRef;
+import org.apache.sqoop.orm.ClassWriter;
 
 import java.math.BigDecimal;
 import java.nio.ByteBuffer;
@@ -72,6 +73,25 @@ public final class AvroUtil {
   }
 
   /**
+   * Convert Column name into Avro column name.
+   */
+  public static String toAvroColumn(String column) {
+    return toAvroIdentifier(column);
+  }
+
+  /**
+   * Format candidate to avro specifics
+   */
+  public static String toAvroIdentifier(String candidate) {
+    String formattedCandidate = candidate.replaceAll("\\W+", "");
+    if (formattedCandidate.substring(0,1).matches("[a-zA-Z_]")) {
+      return formattedCandidate;
+    } else {
+      return "AVRO_" + formattedCandidate;
+    }
+  }
+
+  /**
    * Manipulate a GenericRecord instance.
    */
   public static GenericRecord toGenericRecord(Map<String, Object> fieldMap,
@@ -79,7 +99,8 @@ public final class AvroUtil {
     GenericRecord record = new GenericData.Record(schema);
     for (Map.Entry<String, Object> entry : fieldMap.entrySet()) {
       Object avroObject = toAvro(entry.getValue(), bigDecimalFormatString);
-      record.put(entry.getKey(), avroObject);
+      String avroColumn = toAvroColumn(entry.getKey());
+      record.put(avroColumn, avroObject);
     }
     return record;
   }
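
For illustration, the sanitization added above behaves as follows: characters outside [A-Za-z0-9_] are stripped, and the result is prefixed with "AVRO_" when it does not start with a letter or underscore. A standalone sketch that mirrors toAvroIdentifier (the sample inputs are made up, not taken from the patch):

public class AvroIdentifierSketch {
  // Mirrors the logic of AvroUtil.toAvroIdentifier from the diff above.
  static String toAvroIdentifier(String candidate) {
    String formatted = candidate.replaceAll("\\W+", "");
    if (formatted.substring(0, 1).matches("[a-zA-Z_]")) {
      return formatted;
    }
    return "AVRO_" + formatted;
  }

  public static void main(String[] args) {
    System.out.println(toAvroIdentifier("order-id"));   // orderid
    System.out.println(toAvroIdentifier("2nd_column")); // AVRO_2nd_column
    System.out.println(toAvroIdentifier("_name"));      // _name
  }
}

Column names that already qualify as Avro identifiers pass through unchanged, so schemas generated from well-formed tables are unaffected.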

http://git-wip-us.apache.org/repos/asf/sqoop/blob/b87b994c/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java b/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
index 3c913a8..a73aa13 100644
--- a/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
+++ b/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
@@ -32,6 +32,7 @@ import org.apache.commons.logging.LogFactory;
 
 import com.cloudera.sqoop.SqoopOptions;
 import com.cloudera.sqoop.manager.ConnManager;
+import org.apache.sqoop.avro.AvroUtil;
 
 /**
  * Creates an Avro schema to represent a table from a database.
@@ -60,7 +61,7 @@ public class AvroSchemaGenerator {
 
     List<Field> fields = new ArrayList<Field>();
     for (String columnName : columnNames) {
-      String cleanedCol = ClassWriter.toJavaIdentifier(columnName);
+      String cleanedCol = AvroUtil.toAvroIdentifier(ClassWriter.toJavaIdentifier(columnName));
       int sqlType = columnTypes.get(columnName);
       Schema avroSchema = toAvroSchema(sqlType, columnName);
       Field field = new Field(cleanedCol, avroSchema, null, null);

http://git-wip-us.apache.org/repos/asf/sqoop/blob/b87b994c/src/test/com/cloudera/sqoop/TestAvroImport.java
----------------------------------------------------------------------
diff --git a/src/test/com/cloudera/sqoop/TestAvroImport.java b/src/test/com/cloudera/sqoop/TestAvroImport.java
index dd051f3..08b8aa9 100644
--- a/src/test/com/cloudera/sqoop/TestAvroImport.java
+++ b/src/test/com/cloudera/sqoop/TestAvroImport.java
@@ -206,6 +206,27 @@ public class TestAvroImport extends ImportJobTestCase {
     assertEquals("__NAME", 1987, record1.get("__NAME"));
   }
 
+  public void testNonstandardCharactersInColumnName() throws IOException {
+    String [] names = { "avroƄ1" };
+    String [] types = { "INT" };
+    String [] vals = { "1987" };
+    createTableWithColTypesAndNames(names, types, vals);
+
+    runImport(getOutputArgv(true, null));
+
+    Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
+    DataFileReader<GenericRecord> reader = read(outputFile);
+    Schema schema = reader.getSchema();
+    assertEquals(Schema.Type.RECORD, schema.getType());
+    List<Field> fields = schema.getFields();
+    assertEquals(types.length, fields.size());
+
+    checkField(fields.get(0), "AVRO1", Type.INT);
+
+    GenericRecord record1 = reader.next();
+    assertEquals("AVRO1", 1987, record1.get("AVRO1"));
+  }
+
   private void checkField(Field field, String name, Type type) {
     assertEquals(name, field.name());
     assertEquals(Schema.Type.UNION, field.schema().getType());