You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2018/03/02 18:39:34 UTC

orc git commit: ORC-308. Add function to get subtypes by name.

Repository: orc
Updated Branches:
  refs/heads/master 411e633a2 -> 51b6b6ce3


ORC-308. Add function to get subtypes by name.

Fixes #221

Signed-off-by: Owen O'Malley <om...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/51b6b6ce
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/51b6b6ce
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/51b6b6ce

Branch: refs/heads/master
Commit: 51b6b6ce3c63e3da326978fa016f72247b9f6c0d
Parents: 411e633
Author: Owen O'Malley <om...@apache.org>
Authored: Tue Feb 27 16:02:16 2018 -0800
Committer: Owen O'Malley <om...@apache.org>
Committed: Fri Mar 2 10:37:11 2018 -0800

----------------------------------------------------------------------
 .../java/org/apache/orc/TypeDescription.java    | 123 ++++++++++++++++++-
 .../org/apache/orc/TestTypeDescription.java     |  83 +++++++++++++
 2 files changed, 205 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/51b6b6ce/java/core/src/java/org/apache/orc/TypeDescription.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java
index 7f4d241..86d88ff 100644
--- a/java/core/src/java/org/apache/orc/TypeDescription.java
+++ b/java/core/src/java/org/apache/orc/TypeDescription.java
@@ -275,7 +275,7 @@ public class TypeDescription
     } else {
       while (source.position < source.length) {
         char ch = source.value.charAt(source.position);
-        if (!Character.isLetterOrDigit(ch) && ch != '.' && ch != '_') {
+        if (!Character.isLetterOrDigit(ch) && ch != '_') {
           break;
         }
         source.position += 1;
@@ -925,4 +925,125 @@ public class TypeDescription
       return prev.findSubtype(goal);
     }
   }
+
+  /**
+   * Break the compound name at source into its '.'-separated simple names.
+   * @return the simple names, in the order they were parsed
+   */
+  private static List<String> splitName(StringPosition source) {
+    List<String> names = new ArrayList<>();
+    names.add(parseName(source));
+    while (consumeChar(source, '.')) {
+      names.add(parseName(source));
+    }
+    return names;
+  }
+
+  private static final Pattern INTEGER_PATTERN = Pattern.compile("^[0-9]+$");
+
+  /**
+   * Resolve a '.'-separated name read from source, starting at this type.
+   * A lone all-digit name is handed to findSubtype(int) instead.
+   * @throws IllegalArgumentException if a name does not select a child
+   */
+  private TypeDescription findSubtype(StringPosition source) {
+    List<String> names = splitName(source);
+    if (names.size() == 1 && INTEGER_PATTERN.matcher(names.get(0)).matches()) {
+      return findSubtype(Integer.parseInt(names.get(0)));
+    }
+    TypeDescription current = this;
+    for (String first : names) {
+      switch (current.category) {
+        case STRUCT: {
+          int posn = current.fieldNames.indexOf(first);
+          if (posn == -1) {
+            throw new IllegalArgumentException("Field " + first +
+                " not found in " + current.toString());
+          }
+          current = current.children.get(posn);
+          break;
+        }
+        case LIST:
+          if (!first.equals("_elem")) {
+            throw new IllegalArgumentException("Field " + first +
+                " not found in " + current.toString());
+          }
+          current = current.getChildren().get(0);
+          break;
+        case MAP:
+          if (!first.equals("_key") && !first.equals("_value")) {
+            throw new IllegalArgumentException("Field " + first +
+                " not found in " + current.toString());
+          }
+          current = current.getChildren().get(first.equals("_key") ? 0 : 1);
+          break;
+        case UNION: {
+          try {
+            int posn = Integer.parseInt(first);
+            if (posn < 0 || posn >= current.getChildren().size()) {
+              throw new NumberFormatException("off end of union");
+            }
+            current = current.getChildren().get(posn);
+          } catch (NumberFormatException e) {
+            throw new IllegalArgumentException("Field " + first +
+                " not found in " + current.toString(), e);
+          }
+          break;
+        }
+        default:
+          throw new IllegalArgumentException("Field " + first +
+              " not found in " + current.toString());
+      }
+    }
+    return current;
+  }
+
+  /**
+   * Find a subtype of this schema by name.
+   * If the name is a simple integer, it will be used as a column number.
+   * Otherwise, the name is split on '.' and each part selects a child:
+   * <ul>
+   *   <li>Struct fields are selected by name.</li>
+   *   <li>List children are selected by "_elem".</li>
+   *   <li>Map children are selected by "_key" or "_value".</li>
+   *   <li>Union children are selected by number starting at 0.</li>
+   * </ul>
+   * @param columnName the name to search for
+   * @return the subtype
+   * @throws IllegalArgumentException if text remains after the name is parsed
+   */
+  public TypeDescription findSubtype(String columnName) {
+    StringPosition cursor = new StringPosition(columnName);
+    TypeDescription found = findSubtype(cursor);
+    if (cursor.position == cursor.length) {
+      return found;
+    }
+    throw new IllegalArgumentException("Remaining text in parsing field name "
+        + cursor);
+  }
+
+  /**
+   * Find a list of subtypes from a string, including the empty list.
+   *
+   * Column names are separated by ','. Each name is resolved the same way
+   * as a single column name: either a column number or a '.'-separated path.
+   * @param columnNameList the list of column names
+   * @return the list of subtypes that correspond to the column names, in the
+   *   order they appear in the string
+   * @throws IllegalArgumentException if two names are not separated by ','
+   */
+  public List<TypeDescription> findSubtypes(String columnNameList) {
+    StringPosition cursor = new StringPosition(columnNameList);
+    List<TypeDescription> subtypes = new ArrayList<>();
+    boolean atStart = true;
+    while (cursor.position != cursor.length) {
+      if (!atStart && !consumeChar(cursor, ',')) {
+        throw new IllegalArgumentException("Comma expected in list of column"
+            + " names at " + cursor);
+      }
+      atStart = false;
+      subtypes.add(findSubtype(cursor));
+    }
+    return subtypes;
+  }
 }

http://git-wip-us.apache.org/repos/asf/orc/blob/51b6b6ce/java/core/src/test/org/apache/orc/TestTypeDescription.java
----------------------------------------------------------------------
diff --git a/java/core/src/test/org/apache/orc/TestTypeDescription.java b/java/core/src/test/org/apache/orc/TestTypeDescription.java
index 404c9d0..cb2e8d7 100644
--- a/java/core/src/test/org/apache/orc/TestTypeDescription.java
+++ b/java/core/src/test/org/apache/orc/TestTypeDescription.java
@@ -18,6 +18,7 @@
 package org.apache.orc;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import org.junit.Rule;
 import org.junit.Test;
@@ -220,4 +221,86 @@ public class TestTypeDescription {
     assertEquals(0, type.getId());
     assertEquals(2, leaf.getId());
   }
+
+  @Test
+  public void testFindSubtype() {
+    TypeDescription type = TypeDescription.fromString(
+        "struct<a:int," +
+            "b:struct<c:array<int>,d:map<string,struct<e:string>>>," +
+            "f:string," +
+            "g:uniontype<string,int>>");
+    assertEquals(0, type.findSubtype("0").getId()); // lone integer: column id
+    assertEquals(1, type.findSubtype("a").getId());
+    assertEquals(2, type.findSubtype("b").getId());
+    assertEquals(3, type.findSubtype("b.c").getId());
+    assertEquals(4, type.findSubtype("b.c._elem").getId()); // list element
+    assertEquals(5, type.findSubtype("b.d").getId());
+    assertEquals(6, type.findSubtype("b.d._key").getId()); // map key
+    assertEquals(7, type.findSubtype("b.d._value").getId()); // map value
+    assertEquals(8, type.findSubtype("b.d._value.e").getId());
+    assertEquals(9, type.findSubtype("f").getId());
+    assertEquals(10, type.findSubtype("g").getId());
+    assertEquals(11, type.findSubtype("g.0").getId()); // union child by index
+    assertEquals(12, type.findSubtype("g.1").getId());
+  }
+
+  @Test
+  public void testBadFindSubtype() {
+    TypeDescription type = TypeDescription.fromString(
+        "struct<a:int," +
+            "b:struct<c:array<int>,d:map<string,struct<e:string>>>," +
+            "f:string," +
+            "g:uniontype<string,int>>");
+    try {
+      type.findSubtype("13");
+      assertTrue(false);
+    } catch (IllegalArgumentException e) {
+      // expected: no column in this schema has id 13
+    }
+    try {
+      type.findSubtype("aa");
+      assertTrue(false);
+    } catch (IllegalArgumentException e) {
+      // expected: the top-level struct has no field named aa
+    }
+    try {
+      type.findSubtype("b.a");
+      assertTrue(false);
+    } catch (IllegalArgumentException e) {
+      // expected: struct b has no field named a
+    }
+    try {
+      type.findSubtype("g.2");
+      assertTrue(false);
+    } catch (IllegalArgumentException e) {
+      // expected: union g only has children 0 and 1
+    }
+    try {
+      type.findSubtype("b.c.d");
+      assertTrue(false);
+    } catch (IllegalArgumentException e) {
+      // expected: list c can only be followed by _elem
+    }
+  }
+
+  @Test
+  public void testFindSubtypes() {
+    TypeDescription type = TypeDescription.fromString(
+        "struct<a:int," +
+            "b:struct<c:array<int>,d:map<string,struct<e:string>>>," +
+            "f:string," +
+            "g:uniontype<string,int>>");
+    List<TypeDescription> results = type.findSubtypes("a");
+    assertEquals(1, results.size());
+    assertEquals(1, results.get(0).getId());
+
+    results = type.findSubtypes("b.d._value.e,3,g.0"); // paths and a column id
+    assertEquals(3, results.size());
+    assertEquals(8, results.get(0).getId());
+    assertEquals(3, results.get(1).getId());
+    assertEquals(11, results.get(2).getId());
+
+    results = type.findSubtypes(""); // empty string gives an empty list
+    assertEquals(0, results.size());
+  }
 }