You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/05/28 16:28:46 UTC

svn commit: r1598023 - in /tika/trunk: ./ tika-app/src/main/java/org/apache/tika/cli/ tika-app/src/main/java/org/apache/tika/io/ tika-app/src/main/java/org/apache/tika/io/json/ tika-app/src/test/java/org/apache/tika/cli/ tika-app/src/test/resources/tes...

Author: tallison
Date: Wed May 28 14:28:46 2014
New Revision: 1598023

URL: http://svn.apache.org/r1598023
Log:
TIKA-1291/TIKA-1310 fix bug in JSON output from CLI

Added:
    tika/trunk/tika-app/src/main/java/org/apache/tika/io/
    tika/trunk/tika-app/src/main/java/org/apache/tika/io/json/
    tika/trunk/tika-app/src/main/java/org/apache/tika/io/json/JsonMetadataSerializer.java
    tika/trunk/tika-app/src/test/resources/test-data/testJsonMultipleInts.html
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1598023&r1=1598022&r2=1598023&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed May 28 14:28:46 2014
@@ -1,5 +1,7 @@
 Release 1.6 - ??/??/2014
 
+  * Fixed bug in CLI json output (TIKA-1291/TIKA-1310)
+
   * Added ability to turn off image extraction from PDFs (TIKA-1294).
     Users must now turn on this capability via the PDFParserConfig.
 

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1598023&r1=1598022&r2=1598023&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed May 28 14:28:46 2014
@@ -31,8 +31,6 @@ import java.net.ServerSocket;
 import java.net.Socket;
 import java.net.URI;
 import java.net.URL;
-import java.text.NumberFormat;
-import java.text.ParsePosition;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.HashMap;
@@ -47,6 +45,9 @@ import javax.xml.transform.sax.SAXTransf
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.log4j.BasicConfigurator;
@@ -70,6 +71,7 @@ import org.apache.tika.gui.TikaGUI;
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.io.IOUtils;
 import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.io.json.JsonMetadataSerializer;
 import org.apache.tika.language.LanguageProfilerBuilder;
 import org.apache.tika.language.ProfilingHandler;
 import org.apache.tika.metadata.Metadata;
@@ -90,7 +92,6 @@ import org.apache.tika.xmp.XMPMetadata;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
-import com.google.gson.Gson;
 import org.apache.tika.io.FilenameUtils;
 
 /**
@@ -898,68 +899,30 @@ public class TikaCLI {
         	}
         }
     }
-    
+
     /**
-     * Uses GSON to do the JSON escaping, but does
-     *  the general JSON glueing ourselves.
+     * Writes the collected metadata as JSON via GSON when the document ends.
      */
-    private class NoDocumentJSONMetHandler extends NoDocumentMetHandler {
-        private NumberFormat formatter;
-        private Gson gson;
-       
+    private class NoDocumentJSONMetHandler extends DefaultHandler {
+
+        private final Gson gson;
+
+        protected final Metadata metadata;
+        
+        protected PrintWriter writer;
+
         public NoDocumentJSONMetHandler(Metadata metadata, PrintWriter writer){
-            super(metadata, writer);
-            
-            formatter = NumberFormat.getInstance();
-            gson = new Gson();
+            this.metadata = metadata;
+            this.writer = writer;
+            GsonBuilder builder = new GsonBuilder();
+            builder.registerTypeHierarchyAdapter(Metadata.class, new JsonMetadataSerializer());
+            gson = builder.create();
         }
         
         @Override
-        public void outputMetadata(String[] names) {
-           writer.print("{ ");
-           boolean first = true;
-           for (String name : names) {
-              if(! first) {
-                 writer.println(", ");
-              } else {
-                 first = false;
-              }
-              gson.toJson(name, writer);
-              writer.print(":");
-              outputValues(metadata.getValues(name));
-           }
-           writer.print(" }");
-        }
-        
-        public void outputValues(String[] values) {
-           if(values.length > 1) {
-              writer.print("[");
-           }
-           for(int i=0; i<values.length; i++) {
-              String value = values[i];
-              if(i > 0) {
-                 writer.print(", ");
-              }
-              
-              if(value == null || value.length() == 0) {
-                 writer.print("null");
-              } else {
-                 // Is it a number?
-                 ParsePosition pos = new ParsePosition(0);
-                 formatter.parse(value, pos);
-                 if(value.length() == pos.getIndex()) {
-                    // It's a number. Remove leading zeros and output
-                    value = value.replaceFirst("^0+(\\d)", "$1");
-                    writer.print(value);
-                 } else {
-                    // Not a number, escape it
-                    gson.toJson(value, writer);
-                 }
-              }
-           }
-           if(values.length > 1) {
-              writer.print("]");
-           }
-        }
-    }
+        public void endDocument() throws SAXException {
+                gson.toJson(metadata, writer);
+                writer.flush();
+        }   
+    }    
 }

Added: tika/trunk/tika-app/src/main/java/org/apache/tika/io/json/JsonMetadataSerializer.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/io/json/JsonMetadataSerializer.java?rev=1598023&view=auto
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/io/json/JsonMetadataSerializer.java (added)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/io/json/JsonMetadataSerializer.java Wed May 28 14:28:46 2014
@@ -0,0 +1,89 @@
+package org.apache.tika.io.json;
+
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+import java.lang.reflect.Type;
+import java.util.Arrays;
+
+import org.apache.tika.metadata.Metadata;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonNull;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonPrimitive;
+import com.google.gson.JsonSerializationContext;
+import com.google.gson.JsonSerializer;
+
+
+public class JsonMetadataSerializer implements JsonSerializer<Metadata> {
+
+    /**
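+     * Serializes metadata as a JSON object: one value becomes a string, any other count an array.
+     * @param metadata metadata to serialize; may be null
+     * @param type not used by this implementation
+     * @param context not used by this implementation
+     * @return JsonObject with key/value(s) pairs or JsonNull if metadata or its names array is null.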
+     */
+    @Override
+    public JsonElement serialize(Metadata metadata, Type type, JsonSerializationContext context) {
+        if (metadata == null){
+            return new JsonNull();
+        }
+        String[] names = getNames(metadata);
+        if (names == null) {
+            return new JsonNull();
+        }
+
+        JsonObject root = new JsonObject();
+
+        for (String n : names) {
+            
+            String[] vals = metadata.getValues(n);
+            if (vals == null) {
+                //silently skip?
+                continue;
+            }
+            
+            if (vals.length == 1) {
+                root.addProperty(n, vals[0]);
+            } else {
+                JsonArray jArr = new JsonArray();
+                for (int i = 0; i < vals.length; i++) {
+                    jArr.add(new JsonPrimitive(vals[i]));
+                }
+                root.add(n, jArr);
+            }
+        }
+        return root;
+    }
+    
+    /**
+     * Override to get a custom sort order
+     * or to filter names.
+     * 
+     * @param metadata metadata whose names are returned
+     * @return the names from metadata.names(), sorted with Arrays.sort
+     */
+    protected String[] getNames(Metadata metadata) {
+        String[] names = metadata.names();
+        Arrays.sort(names);
+        return names;
+    }
+
+}

Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1598023&r1=1598022&r2=1598023&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Wed May 28 14:28:46 2014
@@ -135,6 +135,27 @@ public class TikaCLITest {
     }
 
     /**
+     * Basic tests for --json option
+     * 
+     * @throws Exception
+     */
+    @Test
+    public void testJsonMetadataOutput() throws Exception {
+        String[] params = {"--json", resourcePrefix + "testJsonMultipleInts.html"};
+        TikaCLI.main(params);
+        String json = outContent.toString();
+        //TIKA-1310
+        assertTrue(json.contains("\"fb:admins\":\"1,2,3,4\","));
+        
+        //test legacy alphabetic sort of keys
+        int enc = json.indexOf("\"Content-Encoding\"");
+        int fb = json.indexOf("fb:admins");
+        int title = json.indexOf("\"title\"");
+        assertTrue(enc > -1 && fb > -1 && enc < fb);
+        assertTrue (fb > -1 && title > -1 && fb < title);
+    }
+
+    /**
      * Tests -l option of the cli
      * 
      * @throws Exception

Added: tika/trunk/tika-app/src/test/resources/test-data/testJsonMultipleInts.html
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/test-data/testJsonMultipleInts.html?rev=1598023&view=auto
==============================================================================
--- tika/trunk/tika-app/src/test/resources/test-data/testJsonMultipleInts.html (added)
+++ tika/trunk/tika-app/src/test/resources/test-data/testJsonMultipleInts.html Wed May 28 14:28:46 2014
@@ -0,0 +1,9 @@
+<html>
+<head>
+	<meta property="fb:admins" content="1,2,3,4" />
+	<title>this is a title</title>
+</head>
+<body>
+body
+</body>
+</html>
\ No newline at end of file