You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sqoop.apache.org by ar...@apache.org on 2011/08/25 06:52:09 UTC
svn commit: r1161382 - in /incubator/sqoop/trunk: src/docs/user/
src/java/com/cloudera/sqoop/ src/java/com/cloudera/sqoop/lib/
src/java/com/cloudera/sqoop/orm/ src/java/com/cloudera/sqoop/tool/
src/test/com/cloudera/sqoop/hive/ testdata/hive/scripts/
Author: arvind
Date: Thu Aug 25 04:52:09 2011
New Revision: 1161382
URL: http://svn.apache.org/viewvc?rev=1161382&view=rev
Log:
SQOOP-319. Support for replacing Hive delimiters.
(Joey Echeverria via Arvind Prabhakar)
Added:
incubator/sqoop/trunk/testdata/hive/scripts/fieldWithNewlineReplacementImport.q
Modified:
incubator/sqoop/trunk/src/docs/user/hive-args.txt
incubator/sqoop/trunk/src/docs/user/hive.txt
incubator/sqoop/trunk/src/java/com/cloudera/sqoop/SqoopOptions.java
incubator/sqoop/trunk/src/java/com/cloudera/sqoop/lib/FieldFormatter.java
incubator/sqoop/trunk/src/java/com/cloudera/sqoop/orm/ClassWriter.java
incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java
incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/ImportTool.java
incubator/sqoop/trunk/src/test/com/cloudera/sqoop/hive/TestHiveImport.java
Modified: incubator/sqoop/trunk/src/docs/user/hive-args.txt
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/docs/user/hive-args.txt?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/docs/user/hive-args.txt (original)
+++ incubator/sqoop/trunk/src/docs/user/hive-args.txt Thu Aug 25 04:52:09 2011
@@ -32,6 +32,8 @@ Argument Descriptio
to Hive.
+\--hive-drop-import-delims+ Drops '\n', '\r', and '\01' from string\
fields when importing to Hive.
++\--hive-delims-replacement+ Replace '\n', '\r', and '\01' from string\
+ fields with a user-defined string when importing to Hive.
+\--hive-partition-key+ Name of a hive field to partition are \
sharded on
+\--hive-partition-value <v>+ String-value that serves as partition key\
Modified: incubator/sqoop/trunk/src/docs/user/hive.txt
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/docs/user/hive.txt?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/docs/user/hive.txt (original)
+++ incubator/sqoop/trunk/src/docs/user/hive.txt Thu Aug 25 04:52:09 2011
@@ -58,8 +58,11 @@ rows contain string fields that have Hiv
(+\n+ and +\r+ characters) or column delimiters (+\01+ characters)
present in them. You can use the +\--hive-drop-import-delims+ option
to drop those characters on import to give Hive-compatible text data.
-This option should only be used if you use Hive's default delimiters
-and should not be used if different delimiters are specified.
+Alternatively, you can use the +\--hive-delims-replacement+ option
+to replace those characters with a user-defined string on import to give
+Hive-compatible text data. These options should only be used if you use
+Hive's default delimiters and should not be used if different delimiters
+are specified.
Sqoop will pass the field and record delimiters through to Hive. If you do
not set any delimiters and do use +\--hive-import+, the field delimiter will
Modified: incubator/sqoop/trunk/src/java/com/cloudera/sqoop/SqoopOptions.java
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/java/com/cloudera/sqoop/SqoopOptions.java?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/java/com/cloudera/sqoop/SqoopOptions.java (original)
+++ incubator/sqoop/trunk/src/java/com/cloudera/sqoop/SqoopOptions.java Thu Aug 25 04:52:09 2011
@@ -152,6 +152,8 @@ public class SqoopOptions implements Clo
private boolean failIfHiveTableExists;
@StoredAsProperty("hive.table.name") private String hiveTableName;
@StoredAsProperty("hive.drop.delims") private boolean hiveDropDelims;
+ @StoredAsProperty("hive.delims.replacement")
+ private String hiveDelimsReplacement;
@StoredAsProperty("hive.partition.key") private String hivePartitionKey;
@StoredAsProperty("hive.partition.value") private String hivePartitionValue;
@@ -1101,6 +1103,18 @@ public class SqoopOptions implements Clo
}
/**
+ * @return the user-specified option to specify the replacement string
+ * for hive delimiters
+ */
+ public String getHiveDelimsReplacement() {
+ return hiveDelimsReplacement;
+ }
+
+ public void setHiveDelimsReplacement(String replacement) {
+ this.hiveDelimsReplacement = replacement;
+ }
+
+ /**
* @return the user-specified option to specify sqoop's behavior during
* target table creation if the table exists.
*/
Modified: incubator/sqoop/trunk/src/java/com/cloudera/sqoop/lib/FieldFormatter.java
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/java/com/cloudera/sqoop/lib/FieldFormatter.java?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/java/com/cloudera/sqoop/lib/FieldFormatter.java (original)
+++ incubator/sqoop/trunk/src/java/com/cloudera/sqoop/lib/FieldFormatter.java Thu Aug 25 04:52:09 2011
@@ -32,8 +32,20 @@ public final class FieldFormatter {
* @return
*/
public static String hiveStringDropDelims(String str,
+ DelimiterSet delimiters) {
+ return hiveStringReplaceDelims(str, "", delimiters);
+ }
+
+ /**
+ * replace hive delimiters with a user-defined string passed to the
+ * --hive-delims-replacement option.
+ * @param str
+ * @param delimiters
+ * @return
+ */
+ public static String hiveStringReplaceDelims(String str, String replacement,
DelimiterSet delimiters) {
- String droppedDelims = str.replaceAll("\\n|\\r|\01", "");
+ String droppedDelims = str.replaceAll("\\n|\\r|\01", replacement);
return escapeAndEnclose(droppedDelims, delimiters);
}
Modified: incubator/sqoop/trunk/src/java/com/cloudera/sqoop/orm/ClassWriter.java
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/java/com/cloudera/sqoop/orm/ClassWriter.java?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/java/com/cloudera/sqoop/orm/ClassWriter.java (original)
+++ incubator/sqoop/trunk/src/java/com/cloudera/sqoop/orm/ClassWriter.java Thu Aug 25 04:52:09 2011
@@ -826,10 +826,18 @@ public class ClassWriter {
}
if (javaType.equals("String") && options.doHiveDropDelims()) {
- sb.append(" // special case for strings hive, dropping delimiters "
- + "\\n,\\r,\\01 from strings\n");
- sb.append(" __sb.append(FieldFormatter.hiveStringDropDelims("
- + stringExpr + ", delimiters));\n");
+ sb.append("      // special case for strings hive, dropping "
+ + "delimiters \\n,\\r,\\01 from strings\n");
+ sb.append(" __sb.append(FieldFormatter.hiveStringDropDelims("
+ + stringExpr + ", delimiters));\n");
+ } else if (javaType.equals("String")
+ && options.getHiveDelimsReplacement() != null) {
+ sb.append(" // special case for strings hive, replacing "
+ + "delimiters \\n,\\r,\\01 with '"
+ + options.getHiveDelimsReplacement() + "' from strings\n");
+ sb.append(" __sb.append(FieldFormatter.hiveStringReplaceDelims("
+ + stringExpr + ", \"" + options.getHiveDelimsReplacement() + "\", "
+ + "delimiters));\n");
} else {
sb.append(" __sb.append(FieldFormatter.escapeAndEnclose("
+ stringExpr + ", delimiters));\n");
Modified: incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java (original)
+++ incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java Thu Aug 25 04:52:09 2011
@@ -98,6 +98,8 @@ public abstract class BaseSqoopTool exte
public static final String HIVE_TABLE_ARG = "hive-table";
public static final String HIVE_OVERWRITE_ARG = "hive-overwrite";
public static final String HIVE_DROP_DELIMS_ARG = "hive-drop-import-delims";
+ public static final String HIVE_DELIMS_REPLACEMENT_ARG =
+ "hive-delims-replacement";
public static final String HIVE_PARTITION_KEY_ARG = "hive-partition-key";
public static final String HIVE_PARTITION_VALUE_ARG = "hive-partition-value";
public static final String CREATE_HIVE_TABLE_ARG =
@@ -426,6 +428,12 @@ public abstract class BaseSqoopTool exte
+ "(\\n\\r) from imported string fields")
.withLongOpt(HIVE_DROP_DELIMS_ARG)
.create());
+ hiveOpts.addOption(OptionBuilder
+ .hasArg()
+ .withDescription("Replace Hive record \\0x01 and row delimiters "
+ + "(\\n\\r) from imported string fields with user-defined string")
+ .withLongOpt(HIVE_DELIMS_REPLACEMENT_ARG)
+ .create());
hiveOpts.addOption(OptionBuilder.withArgName("partition-key")
.hasArg()
.withDescription("Sets the partition key to use when importing to hive")
@@ -729,6 +737,11 @@ public abstract class BaseSqoopTool exte
out.setHiveDropDelims(true);
}
+ if (in.hasOption(HIVE_DELIMS_REPLACEMENT_ARG)) {
+ out.setHiveDelimsReplacement(
+ in.getOptionValue(HIVE_DELIMS_REPLACEMENT_ARG));
+ }
+
if (in.hasOption(HIVE_PARTITION_KEY_ARG)) {
out.setHivePartitionKey(in.getOptionValue(HIVE_PARTITION_KEY_ARG));
}
@@ -894,6 +907,12 @@ public abstract class BaseSqoopTool exte
throws InvalidOptionsException {
// Empty; this method is present to maintain API consistency, and
// is reserved for future constraints on Hive options.
+ if (options.getHiveDelimsReplacement() != null
+ && options.doHiveDropDelims()) {
+ throw new InvalidOptionsException("The " + HIVE_DROP_DELIMS_ARG
+ + " option conflicts with the " + HIVE_DELIMS_REPLACEMENT_ARG
+ + " option." + HELP_STR);
+ }
}
protected void validateHBaseOptions(SqoopOptions options)
Modified: incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/ImportTool.java
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/ImportTool.java?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/ImportTool.java (original)
+++ incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/ImportTool.java Thu Aug 25 04:52:09 2011
@@ -839,6 +839,7 @@ public class ImportTool extends BaseSqoo
validateCodeGenOptions(options);
validateOutputFormatOptions(options);
validateHBaseOptions(options);
+ validateHiveOptions(options);
}
}
Modified: incubator/sqoop/trunk/src/test/com/cloudera/sqoop/hive/TestHiveImport.java
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/test/com/cloudera/sqoop/hive/TestHiveImport.java?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/test/com/cloudera/sqoop/hive/TestHiveImport.java (original)
+++ incubator/sqoop/trunk/src/test/com/cloudera/sqoop/hive/TestHiveImport.java Thu Aug 25 04:52:09 2011
@@ -32,6 +32,7 @@ import org.apache.hadoop.fs.Path;
import org.junit.Test;
import com.cloudera.sqoop.SqoopOptions;
+import com.cloudera.sqoop.SqoopOptions.InvalidOptionsException;
import com.cloudera.sqoop.testutil.CommonArgs;
import com.cloudera.sqoop.testutil.HsqldbTestServer;
import com.cloudera.sqoop.testutil.ImportJobTestCase;
@@ -40,6 +41,7 @@ import com.cloudera.sqoop.tool.CodeGenTo
import com.cloudera.sqoop.tool.CreateHiveTableTool;
import com.cloudera.sqoop.tool.ImportTool;
import com.cloudera.sqoop.tool.SqoopTool;
+import org.apache.commons.cli.ParseException;
/**
* Test HiveImport capability after an import to HDFS.
@@ -363,6 +365,77 @@ public class TestHiveImport extends Impo
* Test hive import with row that has new line in it.
*/
@Test
+ public void testFieldWithHiveDelimsReplacement() throws IOException,
+ InterruptedException {
+ final String TABLE_NAME = "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT";
+
+ LOG.info("Doing import of single row into "
+ + "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT table");
+ setCurTableName(TABLE_NAME);
+ setNumCols(3);
+ String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
+ String[] vals = { "'test with\nnew lines\n'", "42",
+ "'oh no " + '\01' + " field delims " + '\01' + "'", };
+ String[] moreArgs = { "--"+BaseSqoopTool.HIVE_DELIMS_REPLACEMENT_ARG, " "};
+
+ runImportTest(TABLE_NAME, types, vals,
+ "fieldWithNewlineReplacementImport.q", getArgv(false, moreArgs),
+ new ImportTool());
+
+ LOG.info("Validating data in single row is present in: "
+ + "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT table");
+
+ // Ideally, we would actually invoke hive code to verify that record with
+ // record and field delimiters have values replaced and that we have the
+ // proper number of hive records. Unfortunately, this is a non-trivial task,
+ // and better dealt with at an integration test level
+ //
+ // Instead, this assumes the path of the generated table and just validate
+ // map job output.
+
+ // Get and read the raw output file
+ String whDir = getWarehouseDir();
+ File p = new File(new File(whDir, TABLE_NAME), "part-m-00000");
+ File f = new File(p.toString());
+ FileReader fr = new FileReader(f);
+ BufferedReader br = new BufferedReader(fr);
+ try {
+ // verify the output
+ assertEquals(br.readLine(), "test with new lines " + '\01' + "42"
+ + '\01' + "oh no field delims ");
+ assertEquals(br.readLine(), null); // should only be one line
+ } catch (IOException ioe) {
+ fail("Unable to read files generated from hive");
+ } finally {
+ br.close();
+ }
+ }
+
+ /**
+ * Test hive drop and replace option validation.
+ */
+ @Test
+ public void testHiveDropAndReplaceOptionValidation() throws ParseException {
+ LOG.info("Testing conflicting Hive delimiter drop/replace options");
+
+ setNumCols(3);
+ String[] moreArgs = { "--"+BaseSqoopTool.HIVE_DELIMS_REPLACEMENT_ARG, " ",
+ "--"+BaseSqoopTool.HIVE_DROP_DELIMS_ARG, };
+
+ ImportTool tool = new ImportTool();
+ try {
+ tool.validateOptions(tool.parseArguments(getArgv(false, moreArgs), null,
+ null, true));
+ fail("Expected InvalidOptionsException");
+ } catch (InvalidOptionsException ex) {
+ /* success */
+ }
+ }
+
+ /**
+ * Test hive import with row that has new line in it.
+ */
+ @Test
public void testImportHiveWithPartitions() throws IOException,
InterruptedException {
final String TABLE_NAME = "PARTITION_HIVE_IMPORT";
Added: incubator/sqoop/trunk/testdata/hive/scripts/fieldWithNewlineReplacementImport.q
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/testdata/hive/scripts/fieldWithNewlineReplacementImport.q?rev=1161382&view=auto
==============================================================================
--- incubator/sqoop/trunk/testdata/hive/scripts/fieldWithNewlineReplacementImport.q (added)
+++ incubator/sqoop/trunk/testdata/hive/scripts/fieldWithNewlineReplacementImport.q Thu Aug 25 04:52:09 2011
@@ -0,0 +1,2 @@
+CREATE TABLE IF NOT EXISTS `FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT` ( `DATA_COL0` STRING, `DATA_COL1` INT, `DATA_COL2` STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' LINES TERMINATED BY '\012' STORED AS TEXTFILE;
+LOAD DATA INPATH 'file:BASEPATH/sqoop/warehouse/FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT' INTO TABLE `FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT`;