You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sqoop.apache.org by gw...@apache.org on 2015/04/08 04:39:53 UTC

sqoop git commit: SQOOP-2286: Ensure Sqoop generates valid avro column names

Repository: sqoop
Updated Branches:
  refs/heads/trunk d32137f15 -> baf513512


SQOOP-2286: Ensure Sqoop generates valid avro column names

(Abraham Elmahrek via Gwen Shapira)


Project: http://git-wip-us.apache.org/repos/asf/sqoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/sqoop/commit/baf51351
Tree: http://git-wip-us.apache.org/repos/asf/sqoop/tree/baf51351
Diff: http://git-wip-us.apache.org/repos/asf/sqoop/diff/baf51351

Branch: refs/heads/trunk
Commit: baf51351281842bd660572fcc05c89d6407913c5
Parents: d32137f
Author: Gwen Shapira <cs...@gmail.com>
Authored: Tue Apr 7 19:39:02 2015 -0700
Committer: Gwen Shapira <cs...@gmail.com>
Committed: Tue Apr 7 19:39:02 2015 -0700

----------------------------------------------------------------------
 src/java/org/apache/sqoop/avro/AvroUtil.java    | 23 +++++++++++++++++++-
 .../apache/sqoop/orm/AvroSchemaGenerator.java   |  3 ++-
 src/test/com/cloudera/sqoop/TestAvroImport.java | 21 ++++++++++++++++++
 3 files changed, 45 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/sqoop/blob/baf51351/src/java/org/apache/sqoop/avro/AvroUtil.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/sqoop/avro/AvroUtil.java b/src/java/org/apache/sqoop/avro/AvroUtil.java
index 2fdf263..ee3cf62 100644
--- a/src/java/org/apache/sqoop/avro/AvroUtil.java
+++ b/src/java/org/apache/sqoop/avro/AvroUtil.java
@@ -24,6 +24,7 @@ import org.apache.avro.generic.GenericRecord;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.sqoop.lib.BlobRef;
 import org.apache.sqoop.lib.ClobRef;
+import org.apache.sqoop.orm.ClassWriter;
 
 import java.math.BigDecimal;
 import java.nio.ByteBuffer;
@@ -72,6 +73,25 @@ public final class AvroUtil {
   }
 
   /**
+   * Convert Column name into Avro column name.
+   */
+  public static String toAvroColumn(String column) {
+    return AvroUtil.toAvroIdentifier(column); // columns use the generic identifier rules
+  }
+
+  /**
+   * Format candidate to avro specifics
+   */
+  public static String toAvroIdentifier(String candidate) {
+    String formattedCandidate = candidate.replaceAll("\\W+", "");
+    if (formattedCandidate.substring(0,1).matches("[a-zA-Z_]")) {
+      return formattedCandidate;
+    } else {
+      return "AVRO_" + formattedCandidate;
+    }
+  }
+
+  /**
    * Manipulate a GenericRecord instance.
    */
   public static GenericRecord toGenericRecord(Map<String, Object> fieldMap,
@@ -79,7 +99,8 @@ public final class AvroUtil {
     GenericRecord record = new GenericData.Record(schema);
     for (Map.Entry<String, Object> entry : fieldMap.entrySet()) {
       Object avroObject = toAvro(entry.getValue(), bigDecimalFormatString);
-      record.put(entry.getKey(), avroObject);
+      String avroColumn = toAvroColumn(entry.getKey());
+      record.put(avroColumn, avroObject);
     }
     return record;
   }

http://git-wip-us.apache.org/repos/asf/sqoop/blob/baf51351/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java b/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
index 3c913a8..a73aa13 100644
--- a/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
+++ b/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
@@ -32,6 +32,7 @@ import org.apache.commons.logging.LogFactory;
 
 import com.cloudera.sqoop.SqoopOptions;
 import com.cloudera.sqoop.manager.ConnManager;
+import org.apache.sqoop.avro.AvroUtil;
 
 /**
  * Creates an Avro schema to represent a table from a database.
@@ -60,7 +61,7 @@ public class AvroSchemaGenerator {
 
     List<Field> fields = new ArrayList<Field>();
     for (String columnName : columnNames) {
-      String cleanedCol = ClassWriter.toJavaIdentifier(columnName);
+      String cleanedCol = AvroUtil.toAvroIdentifier(ClassWriter.toJavaIdentifier(columnName));
       int sqlType = columnTypes.get(columnName);
       Schema avroSchema = toAvroSchema(sqlType, columnName);
       Field field = new Field(cleanedCol, avroSchema, null, null);

http://git-wip-us.apache.org/repos/asf/sqoop/blob/baf51351/src/test/com/cloudera/sqoop/TestAvroImport.java
----------------------------------------------------------------------
diff --git a/src/test/com/cloudera/sqoop/TestAvroImport.java b/src/test/com/cloudera/sqoop/TestAvroImport.java
index dd051f3..08b8aa9 100644
--- a/src/test/com/cloudera/sqoop/TestAvroImport.java
+++ b/src/test/com/cloudera/sqoop/TestAvroImport.java
@@ -206,6 +206,27 @@ public class TestAvroImport extends ImportJobTestCase {
     assertEquals("__NAME", 1987, record1.get("__NAME"));
   }
 
+  public void testNonstandardCharactersInColumnName() throws IOException {
+    // Source table has one INT column whose name holds a non-standard character.
+    String [] colNames = { "avroƄ1" };
+    String [] colTypes = { "INT" };
+    String [] colVals = { "1987" };
+    createTableWithColTypesAndNames(colNames, colTypes, colVals);
+    runImport(getOutputArgv(true, null));
+
+    // Inspect the schema of the imported Avro data file.
+    Path avroFile = new Path(getTablePath(), "part-m-00000.avro");
+    DataFileReader<GenericRecord> avroReader = read(avroFile);
+    Schema importedSchema = avroReader.getSchema();
+    assertEquals(Schema.Type.RECORD, importedSchema.getType());
+    List<Field> schemaFields = importedSchema.getFields();
+    assertEquals(colTypes.length, schemaFields.size());
+    checkField(schemaFields.get(0), "AVRO1", Type.INT);
+
+    // The stored value must be readable under the field name "AVRO1".
+    GenericRecord firstRecord = avroReader.next();
+    assertEquals("AVRO1", 1987, firstRecord.get("AVRO1"));
+  }
+
   private void checkField(Field field, String name, Type type) {
     assertEquals(name, field.name());
     assertEquals(Schema.Type.UNION, field.schema().getType());