You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by na...@apache.org on 2012/08/31 08:42:51 UTC

svn commit: r1379313 - in /hive/trunk/ql/src: java/org/apache/hadoop/hive/ql/udf/UDFJson.java java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTFJSONTuple.java test/results/clientpositive/udf_get_json_object.q.out

Author: namit
Date: Fri Aug 31 06:42:51 2012
New Revision: 1379313

URL: http://svn.apache.org/viewvc?rev=1379313&view=rev
Log:
HIVE-3393 get_json_object and json_tuple should use Jackson library
(Kevin Wilfong via namit)


Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFJson.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTFJSONTuple.java
    hive/trunk/ql/src/test/results/clientpositive/udf_get_json_object.q.out

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFJson.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFJson.java?rev=1379313&r1=1379312&r2=1379313&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFJson.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFJson.java Fri Aug 31 06:42:51 2012
@@ -21,6 +21,7 @@ package org.apache.hadoop.hive.ql.udf;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -28,9 +29,9 @@ import java.util.regex.Pattern;
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDF;
 import org.apache.hadoop.io.Text;
-import org.json.JSONArray;
-import org.json.JSONException;
-import org.json.JSONObject;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.codehaus.jackson.map.type.TypeFactory;
+import org.codehaus.jackson.type.JavaType;
 
 /**
  * UDFJson.
@@ -58,6 +59,9 @@ public class UDFJson extends UDF {
   private final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s]+).*");
   private final Pattern patternIndex = Pattern.compile("\\[([0-9]+|\\*)\\]");
 
+  private static final ObjectMapper MAPPER = new ObjectMapper();
+  private static final JavaType MAP_TYPE = TypeFactory.fromClass(Map.class);
+
   // An LRU cache using a linked hash map
   static class HashCache<K, V> extends LinkedHashMap<K, V> {
 
@@ -80,7 +84,8 @@ public class UDFJson extends UDF {
 
   static Map<String, Object> extractObjectCache = new HashCache<String, Object>();
   static Map<String, String[]> pathExprCache = new HashCache<String, String[]>();
-  static Map<String, ArrayList<String>> indexListCache = new HashCache<String, ArrayList<String>>();
+  static Map<String, ArrayList<String>> indexListCache =
+      new HashCache<String, ArrayList<String>>();
   static Map<String, String> mKeyGroup1Cache = new HashCache<String, String>();
   static Map<String, Boolean> mKeyMatchesCache = new HashCache<String, Boolean>();
 
@@ -115,34 +120,47 @@ public class UDFJson extends UDF {
       return null;
     }
 
-    try {
-      // Cache pathExpr
-      String[] pathExpr = pathExprCache.get(pathString);
-      if (pathExpr == null) {
-        pathExpr = pathString.split("\\.", -1);
-        pathExprCache.put(pathString, pathExpr);
-      }
+    // Cache pathExpr
+    String[] pathExpr = pathExprCache.get(pathString);
+    if (pathExpr == null) {
+      pathExpr = pathString.split("\\.", -1);
+      pathExprCache.put(pathString, pathExpr);
+    }
 
-      if (!pathExpr[0].equalsIgnoreCase("$")) {
+    if (!pathExpr[0].equalsIgnoreCase("$")) {
+      return null;
+    }
+    // Cache extractObject
+    Object extractObject = extractObjectCache.get(jsonString);
+    if (extractObject == null) {
+      try {
+        extractObject = MAPPER.readValue(jsonString, MAP_TYPE);
+      } catch (Exception e) {
         return null;
       }
-      // Cache extractObject
-      Object extractObject = extractObjectCache.get(jsonString);
+      extractObjectCache.put(jsonString, extractObject);
+    }
+    for (int i = 1; i < pathExpr.length; i++) {
       if (extractObject == null) {
-        extractObject = new JSONObject(jsonString);
-        extractObjectCache.put(jsonString, extractObject);
+          return null;
       }
-      for (int i = 1; i < pathExpr.length; i++) {
-        extractObject = extract(extractObject, pathExpr[i]);
+      extractObject = extract(extractObject, pathExpr[i]);
+    }
+    if (extractObject instanceof Map || extractObject instanceof List) {
+      try {
+        result.set(MAPPER.writeValueAsString(extractObject));
+      } catch (Exception e) {
+        return null;
       }
+    } else if (extractObject != null) {
       result.set(extractObject.toString());
-      return result;
-    } catch (Exception e) {
+    } else {
       return null;
     }
+    return result;
   }
 
-  private Object extract(Object json, String path) throws JSONException {
+  private Object extract(Object json, String path) {
 
     // Cache patternkey.matcher(path).matches()
     Matcher mKey = null;
@@ -185,68 +203,73 @@ public class UDFJson extends UDF {
     return json;
   }
 
-  ArrayList<Object> jsonList = new ArrayList<Object>();
+  List<Object> jsonList = new ArrayList<Object>();
 
-  private Object extract_json_withindex(Object json, ArrayList<String> indexList)
-      throws JSONException {
+  @SuppressWarnings("unchecked")
+  private Object extract_json_withindex(Object json, ArrayList<String> indexList) {
 
     jsonList.clear();
     jsonList.add(json);
     Iterator<String> itr = indexList.iterator();
     while (itr.hasNext()) {
       String index = itr.next();
-      ArrayList<Object> tmp_jsonList = new ArrayList<Object>();
+      List<Object> tmp_jsonList = new ArrayList<Object>();
       if (index.equalsIgnoreCase("*")) {
-        for (int i = 0; i < (jsonList).size(); i++) {
-          try {
-            JSONArray array = (JSONArray) (jsonList).get(i);
-            for (int j = 0; j < array.length(); j++) {
-              tmp_jsonList.add(array.get(j));
+        for (int i = 0; i < jsonList.size(); i++) {
+          Object array = jsonList.get(i);
+          if (array instanceof List) {
+            for (int j = 0; j < ((List<Object>)array).size(); j++) {
+              tmp_jsonList.add(((List<Object>)array).get(j));
             }
-          } catch (Exception e) {
-            continue;
           }
         }
         jsonList = tmp_jsonList;
       } else {
         for (int i = 0; i < (jsonList).size(); i++) {
-          try {
-            tmp_jsonList.add(((JSONArray) (jsonList).get(i)).get(Integer
-                .parseInt(index)));
-          } catch (ClassCastException e) {
+          Object array = jsonList.get(i);
+          int indexValue = Integer.parseInt(index);
+          if (!(array instanceof List)) {
             continue;
-          } catch (JSONException e) {
+          }
+          if (indexValue >= ((List<Object>)array).size()) {
             return null;
           }
+          tmp_jsonList.add(((List<Object>)array).get(indexValue));
           jsonList = tmp_jsonList;
         }
       }
     }
-    return (jsonList.size() > 1) ? new JSONArray(jsonList) : jsonList.get(0);
+    if (jsonList.isEmpty()) {
+      return null;
+    }
+    return (jsonList.size() > 1) ? new ArrayList<Object>(jsonList) : jsonList.get(0);
   }
 
-  private Object extract_json_withkey(Object json, String path)
-      throws JSONException {
-    if (json.getClass() == org.json.JSONArray.class) {
-      JSONArray jsonArray = new JSONArray();
-      for (int i = 0; i < ((JSONArray) json).length(); i++) {
-        Object josn_elem = ((JSONArray) json).get(i);
-        try {
-          Object json_obj = ((JSONObject) josn_elem).get(path);
-          if (json_obj.getClass() == org.json.JSONArray.class) {
-            for (int j = 0; j < ((JSONArray) json_obj).length(); j++) {
-              jsonArray.put(((JSONArray) json_obj).get(j));
-            }
-          } else {
-            jsonArray.put(json_obj);
-          }
-        } catch (Exception e) {
+  @SuppressWarnings("unchecked")
+  private Object extract_json_withkey(Object json, String path) {
+    if (json instanceof List) {
+      List<Object> jsonArray = new ArrayList<Object>();
+      for (int i = 0; i < ((List<Object>) json).size(); i++) {
+        Object json_elem = ((List<Object>) json).get(i);
+        Object json_obj = null;
+        if (json_elem instanceof Map) {
+          json_obj = ((Map<String, Object>) json_elem).get(path);
+        } else {
           continue;
         }
+        if (json_obj instanceof List) {
+          for (int j = 0; j < ((List<Object>) json_obj).size(); j++) {
+            jsonArray.add(((List<Object>) json_obj).get(j));
+          }
+        } else if (json_obj != null) {
+          jsonArray.add(json_obj);
+        }
       }
-      return (jsonArray.length() == 0) ? null : jsonArray;
+      return (jsonArray.size() == 0) ? null : jsonArray;
+    } else if (json instanceof Map) {
+      return ((Map<String, Object>) json).get(path);
     } else {
-      return ((JSONObject) json).get(path);
+      return null;
     }
   }
 }

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTFJSONTuple.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTFJSONTuple.java?rev=1379313&r1=1379312&r2=1379313&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTFJSONTuple.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDTFJSONTuple.java Fri Aug 31 06:42:51 2012
@@ -20,6 +20,7 @@ package org.apache.hadoop.hive.ql.udf.ge
 
 import java.util.ArrayList;
 import java.util.LinkedHashMap;
+import java.util.List;
 import java.util.Map;
 
 import org.apache.commons.logging.Log;
@@ -34,8 +35,10 @@ import org.apache.hadoop.hive.serde2.obj
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
 import org.apache.hadoop.io.Text;
-import org.json.JSONException;
-import org.json.JSONObject;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.codehaus.jackson.map.type.TypeFactory;
+import org.codehaus.jackson.type.JavaType;
+
 /**
  * GenericUDTFJSONTuple: this
  *
@@ -48,6 +51,9 @@ public class GenericUDTFJSONTuple extend
 
   private static Log LOG = LogFactory.getLog(GenericUDTFJSONTuple.class.getName());
 
+  private static final ObjectMapper MAPPER = new ObjectMapper();
+  private static final JavaType MAP_TYPE = TypeFactory.fromClass(Map.class);
+
   int numCols;    // number of output columns
   String[] paths; // array of path expressions, each of which corresponds to a column
   Text[] retCols; // array of returned column values
@@ -77,7 +83,7 @@ public class GenericUDTFJSONTuple extend
 
   }
 
-  static Map<String, JSONObject> jsonObjectCache = new HashCache<String, JSONObject>();
+  static Map<String, Object> jsonObjectCache = new HashCache<String, Object>();
 
   @Override
   public void close() throws HiveException {
@@ -127,6 +133,7 @@ public class GenericUDTFJSONTuple extend
     return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
   }
 
+  @SuppressWarnings("unchecked")
   @Override
   public void process(Object[] o) throws HiveException {
 
@@ -148,32 +155,39 @@ public class GenericUDTFJSONTuple extend
       return;
     }
     try {
-      JSONObject jsonObj = jsonObjectCache.get(jsonStr);
+      Object jsonObj = jsonObjectCache.get(jsonStr);
       if (jsonObj == null) {
-        jsonObj = new JSONObject(jsonStr);
+        try {
+          jsonObj = MAPPER.readValue(jsonStr, MAP_TYPE);
+        } catch (Exception e) {
+          reportInvalidJson(jsonStr);
+          forward(nullCols);
+          return;
+        }
         jsonObjectCache.put(jsonStr, jsonObj);
       }
 
+      if (!(jsonObj instanceof Map)) {
+        reportInvalidJson(jsonStr);
+        forward(nullCols);
+        return;
+      }
+
       for (int i = 0; i < numCols; ++i) {
-        if (jsonObj.isNull(paths[i])) {
-          retCols[i] = null;
+        if (retCols[i] == null) {
+          retCols[i] = cols[i]; // use the object pool rather than creating a new object
+        }
+        Object extractObject = ((Map<String, Object>)jsonObj).get(paths[i]);
+        if (extractObject instanceof Map || extractObject instanceof List) {
+          retCols[i].set(MAPPER.writeValueAsString(extractObject));
+        } else if (extractObject != null) {
+          retCols[i].set(extractObject.toString());
         } else {
-          if (retCols[i] == null) {
-            retCols[i] = cols[i]; // use the object pool rather than creating a new object
-          }
-          retCols[i].set(jsonObj.getString(paths[i]));
+          retCols[i] = null;
         }
       }
       forward(retCols);
       return;
-    } catch (JSONException e) {
-      // parsing error, invalid JSON string
-      if (!seenErrors) {
-        LOG.error("The input is not a valid JSON string: " + jsonStr + ". Skipping such error messages in the future.");
-        seenErrors = true;
-      }
-      forward(nullCols);
-      return;
     } catch (Throwable e) {
       LOG.error("JSON parsing/evaluation exception" + e);
       forward(nullCols);
@@ -184,4 +198,12 @@ public class GenericUDTFJSONTuple extend
   public String toString() {
     return "json_tuple";
   }
+
+  private void reportInvalidJson(String jsonStr) {
+    if (!seenErrors) {
+      LOG.error("The input is not a valid JSON string: " + jsonStr +
+          ". Skipping such error messages in the future.");
+      seenErrors = true;
+    }
+  }
 }

Modified: hive/trunk/ql/src/test/results/clientpositive/udf_get_json_object.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/udf_get_json_object.q.out?rev=1379313&r1=1379312&r2=1379313&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/udf_get_json_object.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/udf_get_json_object.q.out Fri Aug 31 06:42:51 2012
@@ -92,7 +92,7 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@src_json
 #### A masked pattern was here ####
 POSTHOOK: Lineage: dest1.c1 SIMPLE []
-amy	{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"book":[{"author":"Nigel Rees","category":"reference","title":"Sayings of the Century","price":8.95},{"author":"Herman Melville","category":"fiction","title":"Moby Dick","price":8.99,"isbn":"0-553-21311-3"},{"author":"J. R. R. Tolkien","category":"fiction","title":"The Lord of the Rings","price":22.99,"reader":[{"name":"bob","age":25},{"name":"jack","age":26}],"isbn":"0-395-19395-8"}],"basket":[[1,2,{"b":"y","a":"x"}],[3,4],[5,6]],"bicycle":{"price":19.95,"color":"red"}}
+amy	{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"basket":[[1,2,{"b":"y","a":"x"}],[3,4],[5,6]],"book":[{"author":"Nigel Rees","title":"Sayings of the Century","category":"reference","price":8.95},{"author":"Herman Melville","title":"Moby Dick","category":"fiction","price":8.99,"isbn":"0-553-21311-3"},{"author":"J. R. R. Tolkien","title":"The Lord of the Rings","category":"fiction","reader":[{"age":25,"name":"bob"},{"age":26,"name":"jack"}],"price":22.99,"isbn":"0-395-19395-8"}],"bicycle":{"price":19.95,"color":"red"}}
 PREHOOK: query: SELECT get_json_object(src_json.json, '$.store.bicycle'), get_json_object(src_json.json, '$.store.book') FROM src_json
 PREHOOK: type: QUERY
 PREHOOK: Input: default@src_json
@@ -102,7 +102,7 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@src_json
 #### A masked pattern was here ####
 POSTHOOK: Lineage: dest1.c1 SIMPLE []
-{"price":19.95,"color":"red"}	[{"author":"Nigel Rees","category":"reference","title":"Sayings of the Century","price":8.95},{"author":"Herman Melville","category":"fiction","title":"Moby Dick","price":8.99,"isbn":"0-553-21311-3"},{"author":"J. R. R. Tolkien","category":"fiction","title":"The Lord of the Rings","price":22.99,"reader":[{"name":"bob","age":25},{"name":"jack","age":26}],"isbn":"0-395-19395-8"}]
+{"price":19.95,"color":"red"}	[{"author":"Nigel Rees","title":"Sayings of the Century","category":"reference","price":8.95},{"author":"Herman Melville","title":"Moby Dick","category":"fiction","price":8.99,"isbn":"0-553-21311-3"},{"author":"J. R. R. Tolkien","title":"The Lord of the Rings","category":"fiction","reader":[{"age":25,"name":"bob"},{"age":26,"name":"jack"}],"price":22.99,"isbn":"0-395-19395-8"}]
 PREHOOK: query: SELECT get_json_object(src_json.json, '$.store.book[0]'), get_json_object(src_json.json, '$.store.book[*]') FROM src_json
 PREHOOK: type: QUERY
 PREHOOK: Input: default@src_json
@@ -112,7 +112,7 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@src_json
 #### A masked pattern was here ####
 POSTHOOK: Lineage: dest1.c1 SIMPLE []
-{"author":"Nigel Rees","category":"reference","title":"Sayings of the Century","price":8.95}	[{"author":"Nigel Rees","category":"reference","title":"Sayings of the Century","price":8.95},{"author":"Herman Melville","category":"fiction","title":"Moby Dick","price":8.99,"isbn":"0-553-21311-3"},{"author":"J. R. R. Tolkien","category":"fiction","title":"The Lord of the Rings","price":22.99,"reader":[{"name":"bob","age":25},{"name":"jack","age":26}],"isbn":"0-395-19395-8"}]
+{"author":"Nigel Rees","title":"Sayings of the Century","category":"reference","price":8.95}	[{"author":"Nigel Rees","title":"Sayings of the Century","category":"reference","price":8.95},{"author":"Herman Melville","title":"Moby Dick","category":"fiction","price":8.99,"isbn":"0-553-21311-3"},{"author":"J. R. R. Tolkien","title":"The Lord of the Rings","category":"fiction","reader":[{"age":25,"name":"bob"},{"age":26,"name":"jack"}],"price":22.99,"isbn":"0-395-19395-8"}]
 PREHOOK: query: SELECT get_json_object(src_json.json, '$.store.book[0].category'), get_json_object(src_json.json, '$.store.book[*].category'), get_json_object(src_json.json, '$.store.book[*].isbn'), get_json_object(src_json.json, '$.store.book[*].reader') FROM src_json
 PREHOOK: type: QUERY
 PREHOOK: Input: default@src_json
@@ -122,7 +122,7 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@src_json
 #### A masked pattern was here ####
 POSTHOOK: Lineage: dest1.c1 SIMPLE []
-reference	["reference","fiction","fiction"]	["0-553-21311-3","0-395-19395-8"]	[{"name":"bob","age":25},{"name":"jack","age":26}]
+reference	["reference","fiction","fiction"]	["0-553-21311-3","0-395-19395-8"]	[{"age":25,"name":"bob"},{"age":26,"name":"jack"}]
 PREHOOK: query: SELECT get_json_object(src_json.json, '$.store.book[*].reader[0].age'), get_json_object(src_json.json, '$.store.book[*].reader[*].age')  FROM src_json
 PREHOOK: type: QUERY
 PREHOOK: Input: default@src_json