You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sqoop.apache.org by ar...@apache.org on 2011/08/25 06:52:09 UTC

svn commit: r1161382 - in /incubator/sqoop/trunk: src/docs/user/ src/java/com/cloudera/sqoop/ src/java/com/cloudera/sqoop/lib/ src/java/com/cloudera/sqoop/orm/ src/java/com/cloudera/sqoop/tool/ src/test/com/cloudera/sqoop/hive/ testdata/hive/scripts/

Author: arvind
Date: Thu Aug 25 04:52:09 2011
New Revision: 1161382

URL: http://svn.apache.org/viewvc?rev=1161382&view=rev
Log:
SQOOP-319. Support for replacing Hive delimiters.

(Joey Echeverria via Arvind Prabhakar)

Added:
    incubator/sqoop/trunk/testdata/hive/scripts/fieldWithNewlineReplacementImport.q
Modified:
    incubator/sqoop/trunk/src/docs/user/hive-args.txt
    incubator/sqoop/trunk/src/docs/user/hive.txt
    incubator/sqoop/trunk/src/java/com/cloudera/sqoop/SqoopOptions.java
    incubator/sqoop/trunk/src/java/com/cloudera/sqoop/lib/FieldFormatter.java
    incubator/sqoop/trunk/src/java/com/cloudera/sqoop/orm/ClassWriter.java
    incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java
    incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/ImportTool.java
    incubator/sqoop/trunk/src/test/com/cloudera/sqoop/hive/TestHiveImport.java

Modified: incubator/sqoop/trunk/src/docs/user/hive-args.txt
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/docs/user/hive-args.txt?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/docs/user/hive-args.txt (original)
+++ incubator/sqoop/trunk/src/docs/user/hive-args.txt Thu Aug 25 04:52:09 2011
@@ -32,6 +32,8 @@ Argument                      Descriptio
                               to Hive.
 +\--hive-drop-import-delims+  Drops '\n', '\r', and '\01' from string\
 			      fields when importing to Hive.
++\--hive-delims-replacement+  Replace '\n', '\r', and '\01' from string\
+			      fields with a user-defined string when importing to Hive.
 +\--hive-partition-key+	      Name of a hive field to partition are \
 			      sharded on
 +\--hive-partition-value <v>+ String-value that serves as partition key\

Modified: incubator/sqoop/trunk/src/docs/user/hive.txt
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/docs/user/hive.txt?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/docs/user/hive.txt (original)
+++ incubator/sqoop/trunk/src/docs/user/hive.txt Thu Aug 25 04:52:09 2011
@@ -58,8 +58,11 @@ rows contain string fields that have Hiv
 (+\n+ and +\r+ characters) or column delimiters (+\01+ characters)
 present in them.  You can use the +\--hive-drop-import-delims+ option
 to drop those characters on import to give Hive-compatible text data.
-This option should only be used if you use Hive's default delimiters
-and should not be used if different delimiters are specified.
+Alternatively, you can use the +\--hive-delims-replacement+ option
+to replace those characters with a user-defined string on import to give
+Hive-compatible text data.  These options should only be used if you use
+Hive's default delimiters and should not be used if different delimiters
+are specified.
 
 Sqoop will pass the field and record delimiters through to Hive. If you do
 not set any delimiters and do use +\--hive-import+, the field delimiter will

Modified: incubator/sqoop/trunk/src/java/com/cloudera/sqoop/SqoopOptions.java
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/java/com/cloudera/sqoop/SqoopOptions.java?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/java/com/cloudera/sqoop/SqoopOptions.java (original)
+++ incubator/sqoop/trunk/src/java/com/cloudera/sqoop/SqoopOptions.java Thu Aug 25 04:52:09 2011
@@ -152,6 +152,8 @@ public class SqoopOptions implements Clo
   private boolean failIfHiveTableExists;
   @StoredAsProperty("hive.table.name") private String hiveTableName;
   @StoredAsProperty("hive.drop.delims") private boolean hiveDropDelims;
+  @StoredAsProperty("hive.delims.replacement")
+  private String hiveDelimsReplacement;
   @StoredAsProperty("hive.partition.key") private String hivePartitionKey;
   @StoredAsProperty("hive.partition.value") private String hivePartitionValue;
 
@@ -1101,6 +1103,18 @@ public class SqoopOptions implements Clo
   }
 
   /**
+   * @return the user-specified option to specify the replacement string
+   *         for hive delimiters
+   */
+  public String getHiveDelimsReplacement() {
+    return hiveDelimsReplacement;
+  }
+
+  public void setHiveDelimsReplacement(String replacement) {
+    this.hiveDelimsReplacement = replacement;
+  }
+
+  /**
    * @return the user-specified option to specify sqoop's behavior during
    *         target table creation if the table exists.
    */

Modified: incubator/sqoop/trunk/src/java/com/cloudera/sqoop/lib/FieldFormatter.java
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/java/com/cloudera/sqoop/lib/FieldFormatter.java?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/java/com/cloudera/sqoop/lib/FieldFormatter.java (original)
+++ incubator/sqoop/trunk/src/java/com/cloudera/sqoop/lib/FieldFormatter.java Thu Aug 25 04:52:09 2011
@@ -32,8 +32,20 @@ public final class FieldFormatter {
    * @return
    */
   public static String hiveStringDropDelims(String str,
+          DelimiterSet delimiters) {
+    return hiveStringReplaceDelims(str, "", delimiters);
+  }
+
+  /**
+   * replace hive delimiters with a user-defined string passed to the
+   * --hive-delims-replacement option.
+   * @param str
+   * @param replacement
+   * @param delimiters
+   * @return
+   */
+  public static String hiveStringReplaceDelims(String str, String replacement,
       DelimiterSet delimiters) {
-    String droppedDelims = str.replaceAll("\\n|\\r|\01", "");
+    String droppedDelims = str.replaceAll("\\n|\\r|\01", replacement);
     return escapeAndEnclose(droppedDelims, delimiters);
   }
 

Modified: incubator/sqoop/trunk/src/java/com/cloudera/sqoop/orm/ClassWriter.java
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/java/com/cloudera/sqoop/orm/ClassWriter.java?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/java/com/cloudera/sqoop/orm/ClassWriter.java (original)
+++ incubator/sqoop/trunk/src/java/com/cloudera/sqoop/orm/ClassWriter.java Thu Aug 25 04:52:09 2011
@@ -826,10 +826,18 @@ public class ClassWriter {
       }
 
       if (javaType.equals("String") && options.doHiveDropDelims()) {
-          sb.append("    // special case for strings hive, dropping delimiters "
-              + "\\n,\\r,\\01 from strings\n");
-          sb.append("    __sb.append(FieldFormatter.hiveStringDropDelims("
-              + stringExpr + ", delimiters));\n");
+        sb.append("    // special case for strings hive, dropping "
+          + "delimiters \\n,\\r,\\01 from strings\n");
+        sb.append("    __sb.append(FieldFormatter.hiveStringDropDelims("
+          + stringExpr + ", delimiters));\n");
+      } else if (javaType.equals("String")
+        && options.getHiveDelimsReplacement() != null) {
+        sb.append("    // special case for strings hive, replacing "
+          + "delimiters \\n,\\r,\\01 with '"
+          + options.getHiveDelimsReplacement() + "' from strings\n");
+        sb.append("    __sb.append(FieldFormatter.hiveStringReplaceDelims("
+          + stringExpr + ", \"" + options.getHiveDelimsReplacement() + "\", "
+          + "delimiters));\n");
       } else {
         sb.append("    __sb.append(FieldFormatter.escapeAndEnclose("
             + stringExpr + ", delimiters));\n");

Modified: incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java (original)
+++ incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/BaseSqoopTool.java Thu Aug 25 04:52:09 2011
@@ -98,6 +98,8 @@ public abstract class BaseSqoopTool exte
   public static final String HIVE_TABLE_ARG = "hive-table";
   public static final String HIVE_OVERWRITE_ARG = "hive-overwrite";
   public static final String HIVE_DROP_DELIMS_ARG = "hive-drop-import-delims";
+  public static final String HIVE_DELIMS_REPLACEMENT_ARG =
+          "hive-delims-replacement";
   public static final String HIVE_PARTITION_KEY_ARG = "hive-partition-key";
   public static final String HIVE_PARTITION_VALUE_ARG = "hive-partition-value";
   public static final String CREATE_HIVE_TABLE_ARG =
@@ -426,6 +428,12 @@ public abstract class BaseSqoopTool exte
             + "(\\n\\r) from imported string fields")
         .withLongOpt(HIVE_DROP_DELIMS_ARG)
         .create());
+    hiveOpts.addOption(OptionBuilder
+        .hasArg()
+        .withDescription("Replace Hive record \\0x01 and row delimiters "
+            + "(\\n\\r) in imported string fields with a user-defined string")
+        .withLongOpt(HIVE_DELIMS_REPLACEMENT_ARG)
+        .create());
     hiveOpts.addOption(OptionBuilder.withArgName("partition-key")
         .hasArg()
         .withDescription("Sets the partition key to use when importing to hive")
@@ -729,6 +737,11 @@ public abstract class BaseSqoopTool exte
       out.setHiveDropDelims(true);
     }
 
+    if (in.hasOption(HIVE_DELIMS_REPLACEMENT_ARG)) {
+      out.setHiveDelimsReplacement(
+              in.getOptionValue(HIVE_DELIMS_REPLACEMENT_ARG));
+    }
+
     if (in.hasOption(HIVE_PARTITION_KEY_ARG)) {
       out.setHivePartitionKey(in.getOptionValue(HIVE_PARTITION_KEY_ARG));
     }
@@ -894,6 +907,12 @@ public abstract class BaseSqoopTool exte
       throws InvalidOptionsException {
-    // Empty; this method is present to maintain API consistency, and
-    // is reserved for future constraints on Hive options.
+    if (options.getHiveDelimsReplacement() != null
+            && options.doHiveDropDelims()) {
+      throw new InvalidOptionsException("The " + HIVE_DROP_DELIMS_ARG
+              + " option conflicts with the " + HIVE_DELIMS_REPLACEMENT_ARG
+              + " option." + HELP_STR);
+    }
   }
 
   protected void validateHBaseOptions(SqoopOptions options)

Modified: incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/ImportTool.java
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/ImportTool.java?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/ImportTool.java (original)
+++ incubator/sqoop/trunk/src/java/com/cloudera/sqoop/tool/ImportTool.java Thu Aug 25 04:52:09 2011
@@ -839,6 +839,7 @@ public class ImportTool extends BaseSqoo
     validateCodeGenOptions(options);
     validateOutputFormatOptions(options);
     validateHBaseOptions(options);
+    validateHiveOptions(options);
   }
 }
 

Modified: incubator/sqoop/trunk/src/test/com/cloudera/sqoop/hive/TestHiveImport.java
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/src/test/com/cloudera/sqoop/hive/TestHiveImport.java?rev=1161382&r1=1161381&r2=1161382&view=diff
==============================================================================
--- incubator/sqoop/trunk/src/test/com/cloudera/sqoop/hive/TestHiveImport.java (original)
+++ incubator/sqoop/trunk/src/test/com/cloudera/sqoop/hive/TestHiveImport.java Thu Aug 25 04:52:09 2011
@@ -32,6 +32,7 @@ import org.apache.hadoop.fs.Path;
 import org.junit.Test;
 
 import com.cloudera.sqoop.SqoopOptions;
+import com.cloudera.sqoop.SqoopOptions.InvalidOptionsException;
 import com.cloudera.sqoop.testutil.CommonArgs;
 import com.cloudera.sqoop.testutil.HsqldbTestServer;
 import com.cloudera.sqoop.testutil.ImportJobTestCase;
@@ -40,6 +41,7 @@ import com.cloudera.sqoop.tool.CodeGenTo
 import com.cloudera.sqoop.tool.CreateHiveTableTool;
 import com.cloudera.sqoop.tool.ImportTool;
 import com.cloudera.sqoop.tool.SqoopTool;
+import org.apache.commons.cli.ParseException;
 
 /**
  * Test HiveImport capability after an import to HDFS.
@@ -363,6 +365,77 @@ public class TestHiveImport extends Impo
    * Test hive import with row that has new line in it.
    */
   @Test
+  public void testFieldWithHiveDelimsReplacement() throws IOException,
+    InterruptedException {
+    final String TABLE_NAME = "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT";
+
+    LOG.info("Doing import of single row into "
+        + "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT table");
+    setCurTableName(TABLE_NAME);
+    setNumCols(3);
+    String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
+    String[] vals = { "'test with\nnew lines\n'", "42",
+        "'oh no " + '\01' + " field delims " + '\01' + "'", };
+    String[] moreArgs = { "--"+BaseSqoopTool.HIVE_DELIMS_REPLACEMENT_ARG, " "};
+
+    runImportTest(TABLE_NAME, types, vals,
+        "fieldWithNewlineReplacementImport.q", getArgv(false, moreArgs),
+        new ImportTool());
+
+    LOG.info("Validating data in single row is present in: "
+          + "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT table");
+
+    // Ideally, we would actually invoke hive code to verify that record with
+    // record and field delimiters have values replaced and that we have the
+    // proper number of hive records. Unfortunately, this is a non-trivial task,
+    // and better dealt with at an integration test level
+    //
+    // Instead, this assumes the path of the generated table and just validate
+    // map job output.
+
+    // Get and read the raw output file
+    String whDir = getWarehouseDir();
+    File p = new File(new File(whDir, TABLE_NAME), "part-m-00000");
+    File f = new File(p.toString());
+    FileReader fr = new FileReader(f);
+    BufferedReader br = new BufferedReader(fr);
+    try {
+      // verify the output
+      assertEquals("test with new lines " + '\01' + "42"
+          + '\01' + "oh no   field delims  ", br.readLine());
+      assertEquals(null, br.readLine()); // should only be one line
+    } catch (IOException ioe) {
+      fail("Unable to read files generated from hive");
+    } finally {
+      br.close();
+    }
+  }
+
+  /**
+   * Test hive drop and replace option validation.
+   */
+  @Test
+  public void testHiveDropAndReplaceOptionValidation() throws ParseException {
+    LOG.info("Testing conflicting Hive delimiter drop/replace options");
+
+    setNumCols(3);
+    String[] moreArgs = { "--"+BaseSqoopTool.HIVE_DELIMS_REPLACEMENT_ARG, " ",
+      "--"+BaseSqoopTool.HIVE_DROP_DELIMS_ARG, };
+
+    ImportTool tool = new ImportTool();
+    try {
+      tool.validateOptions(tool.parseArguments(getArgv(false, moreArgs), null,
+          null, true));
+      fail("Expected InvalidOptionsException");
+    } catch (InvalidOptionsException ex) {
+      /* success */
+    }
+  }
+
+  /**
+   * Test hive import of a table with partitions.
+   */
+  @Test
   public void testImportHiveWithPartitions() throws IOException,
       InterruptedException {
     final String TABLE_NAME = "PARTITION_HIVE_IMPORT";

Added: incubator/sqoop/trunk/testdata/hive/scripts/fieldWithNewlineReplacementImport.q
URL: http://svn.apache.org/viewvc/incubator/sqoop/trunk/testdata/hive/scripts/fieldWithNewlineReplacementImport.q?rev=1161382&view=auto
==============================================================================
--- incubator/sqoop/trunk/testdata/hive/scripts/fieldWithNewlineReplacementImport.q (added)
+++ incubator/sqoop/trunk/testdata/hive/scripts/fieldWithNewlineReplacementImport.q Thu Aug 25 04:52:09 2011
@@ -0,0 +1,2 @@
+CREATE TABLE IF NOT EXISTS `FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT` ( `DATA_COL0` STRING, `DATA_COL1` INT, `DATA_COL2` STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' LINES TERMINATED BY '\012' STORED AS TEXTFILE;
+LOAD DATA INPATH 'file:BASEPATH/sqoop/warehouse/FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT' INTO TABLE `FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT`;