You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/05/28 16:28:46 UTC
svn commit: r1598023 - in /tika/trunk: ./
tika-app/src/main/java/org/apache/tika/cli/
tika-app/src/main/java/org/apache/tika/io/
tika-app/src/main/java/org/apache/tika/io/json/
tika-app/src/test/java/org/apache/tika/cli/
tika-app/src/test/resources/tes...
Author: tallison
Date: Wed May 28 14:28:46 2014
New Revision: 1598023
URL: http://svn.apache.org/r1598023
Log:
TIKA-1291/TIKA-1310 fix bug in JSON output from CLI
Added:
tika/trunk/tika-app/src/main/java/org/apache/tika/io/
tika/trunk/tika-app/src/main/java/org/apache/tika/io/json/
tika/trunk/tika-app/src/main/java/org/apache/tika/io/json/JsonMetadataSerializer.java
tika/trunk/tika-app/src/test/resources/test-data/testJsonMultipleInts.html
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1598023&r1=1598022&r2=1598023&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed May 28 14:28:46 2014
@@ -1,5 +1,7 @@
Release 1.6 - ??/??/2014
+ * Fixed bug in CLI JSON output (TIKA-1291/TIKA-1310).
+
* Added ability to turn off image extraction from PDFs (TIKA-1294).
Users must now turn on this capability via the PDFParserConfig.
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1598023&r1=1598022&r2=1598023&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed May 28 14:28:46 2014
@@ -31,8 +31,6 @@ import java.net.ServerSocket;
import java.net.Socket;
import java.net.URI;
import java.net.URL;
-import java.text.NumberFormat;
-import java.text.ParsePosition;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
@@ -47,6 +45,9 @@ import javax.xml.transform.sax.SAXTransf
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.log4j.BasicConfigurator;
@@ -70,6 +71,7 @@ import org.apache.tika.gui.TikaGUI;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.io.json.JsonMetadataSerializer;
import org.apache.tika.language.LanguageProfilerBuilder;
import org.apache.tika.language.ProfilingHandler;
import org.apache.tika.metadata.Metadata;
@@ -90,7 +92,6 @@ import org.apache.tika.xmp.XMPMetadata;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import com.google.gson.Gson;
import org.apache.tika.io.FilenameUtils;
/**
@@ -898,68 +899,30 @@ public class TikaCLI {
}
}
}
-
+
/**
- * Uses GSON to do the JSON escaping, but does
- * the general JSON glueing ourselves.
+ * Uses GSON (with a JsonMetadataSerializer adapter) to write the metadata as JSON in endDocument.
*/
- private class NoDocumentJSONMetHandler extends NoDocumentMetHandler {
- private NumberFormat formatter;
- private Gson gson;
-
+ private class NoDocumentJSONMetHandler extends DefaultHandler {
+
+ private final Gson gson;
+
+ protected final Metadata metadata;
+
+ protected PrintWriter writer;
+
public NoDocumentJSONMetHandler(Metadata metadata, PrintWriter writer){
- super(metadata, writer);
-
- formatter = NumberFormat.getInstance();
- gson = new Gson();
+ this.metadata = metadata;
+ this.writer = writer;
+ GsonBuilder builder = new GsonBuilder();
+ builder.registerTypeHierarchyAdapter(Metadata.class, new JsonMetadataSerializer());
+ gson = builder.create();
}
@Override
- public void outputMetadata(String[] names) {
- writer.print("{ ");
- boolean first = true;
- for (String name : names) {
- if(! first) {
- writer.println(", ");
- } else {
- first = false;
- }
- gson.toJson(name, writer);
- writer.print(":");
- outputValues(metadata.getValues(name));
- }
- writer.print(" }");
- }
-
- public void outputValues(String[] values) {
- if(values.length > 1) {
- writer.print("[");
- }
- for(int i=0; i<values.length; i++) {
- String value = values[i];
- if(i > 0) {
- writer.print(", ");
- }
-
- if(value == null || value.length() == 0) {
- writer.print("null");
- } else {
- // Is it a number?
- ParsePosition pos = new ParsePosition(0);
- formatter.parse(value, pos);
- if(value.length() == pos.getIndex()) {
- // It's a number. Remove leading zeros and output
- value = value.replaceFirst("^0+(\\d)", "$1");
- writer.print(value);
- } else {
- // Not a number, escape it
- gson.toJson(value, writer);
- }
- }
- }
- if(values.length > 1) {
- writer.print("]");
- }
- }
- }
+ public void endDocument() throws SAXException {
+ gson.toJson(metadata, writer);
+ writer.flush();
+ }
+ }
}
Added: tika/trunk/tika-app/src/main/java/org/apache/tika/io/json/JsonMetadataSerializer.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/io/json/JsonMetadataSerializer.java?rev=1598023&view=auto
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/io/json/JsonMetadataSerializer.java (added)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/io/json/JsonMetadataSerializer.java Wed May 28 14:28:46 2014
@@ -0,0 +1,89 @@
+package org.apache.tika.io.json;
+
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+import java.lang.reflect.Type;
+import java.util.Arrays;
+
+import org.apache.tika.metadata.Metadata;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonNull;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonPrimitive;
+import com.google.gson.JsonSerializationContext;
+import com.google.gson.JsonSerializer;
+
+
+public class JsonMetadataSerializer implements JsonSerializer<Metadata> {
+
+ /**
+ * Serializes metadata as a JSON object: a name with exactly one value maps to a string, any other count to an array.
+ * @param metadata metadata to serialize; may be null
+ * @param type not used by this implementation
+ * @param context not used by this implementation
+ * @return JsonObject with key/value(s) pairs, or JsonNull if metadata or the names array is null.
+ */
+ @Override
+ public JsonElement serialize(Metadata metadata, Type type, JsonSerializationContext context) {
+ if (metadata == null){
+ return new JsonNull();
+ }
+ String[] names = getNames(metadata);
+ if (names == null) {
+ return new JsonNull();
+ }
+
+ JsonObject root = new JsonObject();
+
+ for (String n : names) {
+
+ String[] vals = metadata.getValues(n);
+ if (vals == null) {
+ //no values array for this name: omit it from the output (silently skip?)
+ continue;
+ }
+
+ if (vals.length == 1) {
+ root.addProperty(n, vals[0]);
+ } else {
+ JsonArray jArr = new JsonArray();
+ for (int i = 0; i < vals.length; i++) {
+ jArr.add(new JsonPrimitive(vals[i]));
+ }
+ root.add(n, jArr);
+ }
+ }
+ return root;
+ }
+
+ /**
+ * Returns the metadata names sorted with Arrays.sort (natural String order).
+ * Override to get a custom sort order or to filter names.
+ *
+ * @param metadata metadata whose names are listed; must not be null
+ * @return the sorted names; serialize emits JsonNull if an override returns null
+ */
+ protected String[] getNames(Metadata metadata) {
+ String[] names = metadata.names();
+ Arrays.sort(names);
+ return names;
+ }
+
+}
Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1598023&r1=1598022&r2=1598023&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Wed May 28 14:28:46 2014
@@ -135,6 +135,27 @@ public class TikaCLITest {
}
/**
+ * Basic tests for --json option
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testJsonMetadataOutput() throws Exception {
+ String[] params = {"--json", resourcePrefix + "testJsonMultipleInts.html"};
+ TikaCLI.main(params);
+ String json = outContent.toString();
+ //TIKA-1310: the comma-separated value "1,2,3,4" must be emitted as one quoted JSON string
+ assertTrue(json.contains("\"fb:admins\":\"1,2,3,4\","));
+
+ //test legacy alphabetic sort of keys
+ int enc = json.indexOf("\"Content-Encoding\"");
+ int fb = json.indexOf("fb:admins");
+ int title = json.indexOf("\"title\"");
+ assertTrue(enc > -1 && fb > -1 && enc < fb);
+ assertTrue (fb > -1 && title > -1 && fb < title);
+ }
+
+ /**
* Tests -l option of the cli
*
* @throws Exception
Added: tika/trunk/tika-app/src/test/resources/test-data/testJsonMultipleInts.html
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/test-data/testJsonMultipleInts.html?rev=1598023&view=auto
==============================================================================
--- tika/trunk/tika-app/src/test/resources/test-data/testJsonMultipleInts.html (added)
+++ tika/trunk/tika-app/src/test/resources/test-data/testJsonMultipleInts.html Wed May 28 14:28:46 2014
@@ -0,0 +1,9 @@
+<html>
+<head>
+ <meta property="fb:admins" content="1,2,3,4" />
+ <title>this is a title</title>
+<head>
+<body>
+body
+</body>
+</html>
\ No newline at end of file