You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/10/22 02:31:38 UTC
svn commit: r1633499 - in /tika/trunk: ./
tika-app/src/main/java/org/apache/tika/cli/
tika-app/src/main/java/org/apache/tika/gui/
tika-app/src/test/java/org/apache/tika/cli/
tika-app/src/test/resources/test-data/
tika-serialization/src/main/java/org/ap...
Author: tallison
Date: Wed Oct 22 00:31:37 2014
New Revision: 1633499
URL: http://svn.apache.org/r1633499
Log:
TIKA-1451 add RecursiveParserWrapper output to CLI and GUI
Added:
tika/trunk/tika-app/src/test/resources/test-data/test_recursive_embedded.docx (with props)
tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataBase.java
tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java
tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataListTest.java
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadata.java
tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1633499&r1=1633498&r2=1633499&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Oct 22 00:31:37 2014
@@ -1,12 +1,15 @@
Release 1.7 - Current Development
+ * Tika CLI and GUI now have an option to view a JSON rendering of the output
+ of RecursiveParserWrapper (TIKA-1451).
+
* Tika now integrates the Geospatial Data Abstraction Library
(GDAL) for parsing hundreds of geospatial formats (TIKA-605).
* ExternalParsers can now use Regexs to specify dynamic keys
(TIKA-1441).
- * Thread safety issue in ImageMetadataExtractor were resolved
+ * Thread safety issues in ImageMetadataExtractor were resolved
(TIKA-1369).
* The ForkParser service is now registered in Activator
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1633499&r1=1633498&r2=1633499&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed Oct 22 00:31:37 2014
@@ -16,6 +16,37 @@
*/
package org.apache.tika.cli;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintStream;
+import java.io.PrintWriter;
+import java.io.UnsupportedEncodingException;
+import java.io.Writer;
+import java.lang.reflect.Field;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.net.URI;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.log4j.BasicConfigurator;
@@ -44,6 +75,7 @@ import org.apache.tika.language.Language
import org.apache.tika.language.ProfilingHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.serialization.JsonMetadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypeException;
@@ -54,45 +86,17 @@ import org.apache.tika.parser.ParseConte
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.xmp.XMPMetadata;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.TransformerConfigurationException;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintStream;
-import java.io.PrintWriter;
-import java.io.UnsupportedEncodingException;
-import java.io.Writer;
-import java.lang.reflect.Field;
-import java.net.ServerSocket;
-import java.net.Socket;
-import java.net.URI;
-import java.net.URL;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-
/**
* Simple command line interface for Apache Tika.
*/
@@ -281,6 +285,8 @@ public class TikaCLI {
private String configFilePath;
private OutputType type = XML;
+
+ private boolean recursiveJSON = false;
private LanguageProfilerBuilder ngp = null;
@@ -367,7 +373,9 @@ public class TikaCLI {
password = arg.substring("--password=".length());
} else if (arg.equals("-j") || arg.equals("--json")) {
type = JSON;
- } else if (arg.equals("-y") || arg.equals("--xmp")) {
+ } else if (arg.equals("-J") || arg.equals("--jsonRecursive")) {
+ recursiveJSON = true;
+ } else if (arg.equals("-y") || arg.equals("--xmp")) {
type = XMP;
} else if (arg.equals("-x") || arg.equals("--xml")) {
type = XML;
@@ -423,18 +431,55 @@ public class TikaCLI {
} else {
url = new URL(arg);
}
- Metadata metadata = new Metadata();
- InputStream input = TikaInputStream.get(url, metadata);
- try {
- type.process(input, System.out, metadata);
- } finally {
- input.close();
- System.out.flush();
+ if (recursiveJSON) {
+ handleRecursiveJson(url, System.out);
+ } else {
+ Metadata metadata = new Metadata();
+ InputStream input = TikaInputStream.get(url, metadata);
+ try {
+ type.process(input, System.out, metadata);
+ } finally {
+ input.close();
+ System.out.flush();
+ }
}
}
}
}
+    /**
+     * Parses the resource at the given URL with a {@link RecursiveParserWrapper}
+     * and writes the list of metadata objects collected by the wrapper to the
+     * output as JSON. The input stream is closed once parsing finishes or fails;
+     * the output writer is flushed but not closed.
+     *
+     * @param url location of the document to parse
+     * @param output stream that receives the JSON
+     */
+    private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException {
+        Metadata rootMetadata = new Metadata();
+        InputStream stream = TikaInputStream.get(url, rootMetadata);
+        RecursiveParserWrapper recursiveParser =
+                new RecursiveParserWrapper(parser, getContentHandlerFactory(type));
+        try {
+            // no content handler is passed here; the wrapper was built with a handler factory
+            recursiveParser.parse(stream, null, rootMetadata, context);
+        } finally {
+            stream.close();
+        }
+
+        // honour the -r/--pretty-print flag for the list serializer
+        JsonMetadataList.setPrettyPrinting(prettyPrint);
+        Writer jsonWriter = getOutputWriter(output, encoding);
+        try {
+            JsonMetadataList.toJson(recursiveParser.getMetadata(), jsonWriter);
+        } finally {
+            jsonWriter.flush();
+        }
+    }
+
+ private ContentHandlerFactory getContentHandlerFactory(OutputType type) {
+ BasicContentHandlerFactory.HANDLER_TYPE handlerType = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
+ if (type.equals(HTML)) {
+ handlerType = BasicContentHandlerFactory.HANDLER_TYPE.HTML;
+ } else if (type.equals(XML)) {
+ handlerType = BasicContentHandlerFactory.HANDLER_TYPE.XML;
+ } else if (type.equals(TEXT)) {
+ handlerType = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
+ } else if (type.equals(TEXT_MAIN)) {
+ handlerType = BasicContentHandlerFactory.HANDLER_TYPE.BODY;
+ } else if (type.equals(METADATA)) {
+ handlerType = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
+ }
+ return new BasicContentHandlerFactory(handlerType, -1);
+ }
private void usage() {
PrintStream out = System.out;
out.println("usage: java -jar tika-app.jar [option...] [file|port...]");
@@ -458,13 +503,16 @@ public class TikaCLI {
out.println(" -m or --metadata Output only metadata");
out.println(" -j or --json Output metadata in JSON");
out.println(" -y or --xmp Output metadata in XMP");
+ out.println(" -J or --jsonRecursive Output metadata and content from all");
+ out.println(" embedded files (choose content type");
+ out.println(" with -x, -h, -t or -m; default is -x)");
out.println(" -l or --language Output only language");
out.println(" -d or --detect Detect document type");
out.println(" -eX or --encoding=X Use output encoding X");
out.println(" -pX or --password=X Use document password X");
out.println(" -z or --extract Extract all attachements into current directory");
out.println(" --extract-dir=<dir> Specify target directory for -z");
- out.println(" -r or --pretty-print For XML and XHTML outputs, adds newlines and");
+ out.println(" -r or --pretty-print For JSON, XML and XHTML outputs, adds newlines and");
out.println(" whitespace, for better readability");
out.println();
out.println(" --create-profile=X");
@@ -950,6 +998,7 @@ public class TikaCLI {
@Override
public void endDocument() throws SAXException {
try {
+ JsonMetadata.setPrettyPrinting(prettyPrint);
JsonMetadata.toJson(metadata, writer);
writer.flush();
} catch (TikaException e) {
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=1633499&r1=1633498&r2=1633499&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Wed Oct 22 00:31:37 2014
@@ -16,27 +16,6 @@
*/
package org.apache.tika.gui;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.DocumentSelector;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.html.BoilerpipeContentHandler;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.ContentHandlerDecorator;
-import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
import javax.swing.Box;
import javax.swing.JDialog;
import javax.swing.JEditorPane;
@@ -82,6 +61,30 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Set;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.DocumentSelector;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
/**
* Simple Swing GUI for Apache Tika. You can drag and drop files on top
* of the window to have them parsed.
@@ -117,6 +120,8 @@ public class TikaGUI extends JFrame
});
}
+ //maximum number of bytes passed to mark() so the stream can be reset and reparsed for the JSON view
+ private final int MAX_MARK = 20971520;//20MB
/**
* Parsing context.
*/
@@ -163,6 +168,11 @@ public class TikaGUI extends JFrame
private final JEditorPane xml;
/**
+ * Raw JSON source.
+ */
+ private final JEditorPane json;
+
+ /**
* Document metadata.
*/
private final JEditorPane metadata;
@@ -185,6 +195,7 @@ public class TikaGUI extends JFrame
text = addCard(cards, "text/plain", "text");
textMain = addCard(cards, "text/plain", "main");
xml = addCard(cards, "text/plain", "xhtml");
+ json = addCard(cards, "text/plain", "json");
add(cards);
layout.show(cards, "welcome");
@@ -217,6 +228,7 @@ public class TikaGUI extends JFrame
addMenuItem(view, "Plain text", "text", KeyEvent.VK_P);
addMenuItem(view, "Main content", "main", KeyEvent.VK_C);
addMenuItem(view, "Structured text", "xhtml", KeyEvent.VK_S);
+ addMenuItem(view, "Recursive JSON", "json", KeyEvent.VK_J);
bar.add(view);
bar.add(Box.createHorizontalGlue());
@@ -267,6 +279,8 @@ public class TikaGUI extends JFrame
layout.show(cards, command);
} else if ("metadata".equals(command)) {
layout.show(cards, command);
+ } else if ("json".equals(command)) {
+ layout.show(cards, command);
} else if ("about".equals(command)) {
textDialog(
"About Apache Tika",
@@ -320,7 +334,9 @@ public class TikaGUI extends JFrame
getXmlContentHandler(xmlBuffer));
context.set(DocumentSelector.class, new ImageDocumentSelector());
-
+ if (input.markSupported()) {
+ input.mark(MAX_MARK);
+ }
input = new ProgressMonitorInputStream(
this, "Parsing stream", input);
parser.parse(input, handler, md, context);
@@ -346,6 +362,30 @@ public class TikaGUI extends JFrame
setText(text, textBuffer.toString());
setText(textMain, textMainBuffer.toString());
setText(html, htmlBuffer.toString());
+ if (!input.markSupported()) {
+ setText(json, "InputStream does not support mark/reset for Recursive Parsing");
+ layout.show(cards, "metadata");
+ return;
+ }
+ boolean isReset = false;
+ try {
+ input.reset();
+ isReset = true;
+ } catch (IOException e) {
+ setText(json, "Error during stream reset.\n"+
+ "There's a limit of "+MAX_MARK + " bytes for this type of processing in the GUI.\n"+
+ "Try the app with command line argument of -J."
+ );
+ }
+ if (isReset) {
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
+ wrapper.parse(input, null, new Metadata(), new ParseContext());
+ StringWriter jsonBuffer = new StringWriter();
+ JsonMetadataList.setPrettyPrinting(true);
+ JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer);
+ setText(json, jsonBuffer.toString());
+ }
layout.show(cards, "metadata");
}
Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1633499&r1=1633498&r2=1633499&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Wed Oct 22 00:31:37 2014
@@ -16,17 +16,17 @@
*/
package org.apache.tika.cli;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.PrintStream;
+import java.net.URI;
+
import org.apache.commons.io.FileUtils;
import org.apache.tika.exception.TikaException;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.PrintStream;
-import java.net.URI;
-
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@@ -160,6 +160,29 @@ public class TikaCLITest {
}
/**
+ * Test for --json with the pretty-print (-r) option
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testJsonMetadataPrettyPrintOutput() throws Exception {
+ //run the CLI with JSON output (--json) and pretty printing (-r)
+ String[] params = {"--json", "-r", resourcePrefix + "testJsonMultipleInts.html"};
+ TikaCLI.main(params);
+ String json = outContent.toString("UTF-8");
+
+ //a multi-valued key is expected as an array with one value per line
+ assertTrue(json.contains(" \"X-Parsed-By\": [\n" +
+ " \"org.apache.tika.parser.DefaultParser\",\n" +
+ " \"org.apache.tika.parser.html.HtmlParser\"\n" +
+ " ],\n"));
+ //test legacy alphabetic sort of keys
+ //i.e. "Content-Encoding" appears before "fb:admins", which appears before "title"
+ int enc = json.indexOf("\"Content-Encoding\"");
+ int fb = json.indexOf("fb:admins");
+ int title = json.indexOf("\"title\"");
+ assertTrue(enc > -1 && fb > -1 && enc < fb);
+ assertTrue (fb > -1 && title > -1 && fb < title);
+ }
+
+ /**
* Tests -l option of the cli
*
* @throws Exception
@@ -321,4 +344,40 @@ public class TikaCLITest {
assertTrue(content.contains("apple"));
assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser"));
}
+
+ /**
+ * Tests -J (recursive JSON) combined with -m and -r: the output is expected
+ * to be a pretty-printed JSON list that contains no "tika:content" entry.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testJsonRecursiveMetadataParserMetadataOnly() throws Exception {
+ String[] params = new String[]{"-m", "-J", "-r", resourcePrefix+"test_recursive_embedded.docx"};
+ TikaCLI.main(params);
+ String content = outContent.toString("UTF-8");
+ //the list opens with the metadata of the first document, keys in sorted order
+ assertTrue(content.contains("[\n" +
+ " {\n" +
+ " \"Application-Name\": \"Microsoft Office Word\",\n" +
+ " \"Application-Version\": \"15.0000\",\n" +
+ " \"Character Count\": \"28\",\n" +
+ " \"Character-Count-With-Spaces\": \"31\","));
+ //the last entry of the list is expected to be the embedded embed1.zip
+ assertTrue(content.endsWith(" \"tika:embedded_resource_path\": \"test_recursive_embedded.docx/embed1.zip\"\n" +
+ " }\n" +
+ "]"));
+ //with -m no content must be emitted
+ assertFalse(content.contains("tika:content"));
+
+ }
+
+    /**
+     * Tests -J without an explicit content type option: the "tika:content"
+     * value is expected to start with an (escaped) XHTML document.
+     *
+     * @throws Exception
+     */
+    @Test
+    public void testJsonRecursiveMetadataParserDefault() throws Exception {
+        String embeddedDoc = resourcePrefix+"test_recursive_embedded.docx";
+        TikaCLI.main(new String[]{"-J", "-r", embeddedDoc});
+        String json = outContent.toString("UTF-8");
+        String xhtmlContentStart =
+                "\"tika:content\": \"\\u003chtml xmlns\\u003d\\\"http://www.w3.org/1999/xhtml";
+        assertTrue(json.contains(xhtmlContentStart));
+    }
+
+    /**
+     * Tests -J combined with -t: the output is expected to contain the
+     * plain-text markers of the embedded documents.
+     *
+     * @throws Exception
+     */
+    @Test
+    public void testJsonRecursiveMetadataParserText() throws Exception {
+        String embeddedDoc = resourcePrefix+"test_recursive_embedded.docx";
+        TikaCLI.main(new String[]{"-J", "-r", "-t", embeddedDoc});
+        String json = outContent.toString("UTF-8");
+        boolean hasEmbed4 = json.contains("\\n\\nembed_4\\n");
+        assertTrue(hasEmbed4);
+        boolean hasEmbed0 = json.contains("\\n\\nembed_0");
+        assertTrue(hasEmbed0);
+    }
+
}
Added: tika/trunk/tika-app/src/test/resources/test-data/test_recursive_embedded.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/test-data/test_recursive_embedded.docx?rev=1633499&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-app/src/test/resources/test-data/test_recursive_embedded.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadata.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadata.java?rev=1633499&r1=1633498&r2=1633499&view=diff
==============================================================================
--- tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadata.java (original)
+++ tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadata.java Wed Oct 22 00:31:37 2014
@@ -21,25 +21,17 @@ package org.apache.tika.metadata.seriali
import java.io.Reader;
import java.io.Writer;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-
import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
import com.google.gson.JsonIOException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
-public class JsonMetadata {
-
+public class JsonMetadata extends JsonMetadataBase{
private static Gson GSON;
-
+
static {
- GsonBuilder builder = new GsonBuilder();
- builder.registerTypeHierarchyAdapter(Metadata.class, new JsonMetadataSerializer());
- builder.registerTypeHierarchyAdapter(Metadata.class, new JsonMetadataDeserializer());
- GSON = builder.create();
+ GSON = defaultInit();
}
-
-
/**
* Serializes a Metadata object to Json. This does not flush or close the writer.
*
@@ -72,15 +64,24 @@ public class JsonMetadata {
}
return m;
}
-
+
/**
* Enables setting custom configurations on Gson. Remember to register
* a serializer and a deserializer for Metadata. This does a literal set
* and does not add the default serializer and deserializers.
- *
+ *
* @param gson
*/
public static void setGson(Gson gson) {
GSON = gson;
}
+
+    /**
+     * Replaces the shared Gson instance with a freshly built one: the
+     * pretty-printing configuration when {@code prettyPrint} is true, the
+     * default configuration otherwise. An instance previously installed
+     * through {@link #setGson(Gson)} is discarded.
+     *
+     * @param prettyPrint whether subsequent output should be pretty printed
+     */
+    public static void setPrettyPrinting(boolean prettyPrint) {
+        GSON = prettyPrint ? prettyInit() : defaultInit();
+    }
+
}
Added: tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataBase.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataBase.java?rev=1633499&view=auto
==============================================================================
--- tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataBase.java (added)
+++ tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataBase.java Wed Oct 22 00:31:37 2014
@@ -0,0 +1,76 @@
+package org.apache.tika.metadata.serialization;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Shared Gson configurations for the JSON metadata serialization helpers.
+ */
+public class JsonMetadataBase {
+
+    /**
+     * Builds a compact (not pretty-printed) Gson instance with the standard
+     * serializer and deserializer registered for {@link Metadata}.
+     *
+     * @return newly created Gson instance
+     */
+    static Gson defaultInit() {
+        GsonBuilder builder = new GsonBuilder();
+        builder.registerTypeHierarchyAdapter(Metadata.class, new JsonMetadataSerializer());
+        builder.registerTypeHierarchyAdapter(Metadata.class, new JsonMetadataDeserializer());
+        return builder.create();
+    }
+
+    /**
+     * Builds a pretty-printing Gson instance whose serializer writes the
+     * metadata keys in sorted order, with "tika:content" last.
+     *
+     * @return newly created Gson instance
+     */
+    static Gson prettyInit() {
+        GsonBuilder builder = new GsonBuilder();
+        builder.registerTypeHierarchyAdapter(Metadata.class, new SortedJsonMetadataSerializer());
+        builder.registerTypeHierarchyAdapter(Metadata.class, new JsonMetadataDeserializer());
+        builder.setPrettyPrinting();
+        return builder.create();
+    }
+
+    /**
+     * Serializer that hands the metadata names to its parent class in sorted order.
+     */
+    private static class SortedJsonMetadataSerializer extends JsonMetadataSerializer {
+
+        //the comparator keeps no state, so one instance serves every call
+        private static final MetadataKeyComparator KEY_COMPARATOR = new MetadataKeyComparator();
+
+        @Override
+        public String[] getNames(Metadata m) {
+            String[] names = m.names();
+            Arrays.sort(names, KEY_COMPARATOR);
+            return names;
+        }
+
+        /**
+         * Orders keys by natural String order, except that "tika:content"
+         * sorts after every other non-null key and null sorts last of all.
+         */
+        private static class MetadataKeyComparator implements java.util.Comparator<String> {
+
+            //Ideally this would reference RecursiveParserWrapper.TIKA_CONTENT,
+            //but that would require making core a dependency of serialization...
+            //do we want to do that?
+            private static final String TIKA_CONTENT_KEY = "tika:content";
+
+            @Override
+            public int compare(String s1, String s2) {
+                if (s1 == null) {
+                    //two nulls are equal; returning 1 here would break the Comparator contract
+                    return (s2 == null) ? 0 : 1;
+                } else if (s2 == null) {
+                    return -1;
+                }
+
+                if (s1.equals(TIKA_CONTENT_KEY)) {
+                    if (s2.equals(TIKA_CONTENT_KEY)) {
+                        return 0;
+                    }
+                    return 2;
+                } else if (s2.equals(TIKA_CONTENT_KEY)) {
+                    return -2;
+                }
+                return s1.compareTo(s2);
+            }
+        }
+    }
+}
Added: tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java?rev=1633499&view=auto
==============================================================================
--- tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java (added)
+++ tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java Wed Oct 22 00:31:37 2014
@@ -0,0 +1,96 @@
+package org.apache.tika.metadata.serialization;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.Reader;
+import java.io.Writer;
+import java.lang.reflect.Type;
+import java.util.List;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonIOException;
+import com.google.gson.reflect.TypeToken;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Static helpers that convert between a list of {@link Metadata} objects and JSON.
+ * The Gson instance is shared static state: {@link #setGson(Gson)} and
+ * {@link #setPrettyPrinting(boolean)} replace it for every subsequent call.
+ */
+public class JsonMetadataList extends JsonMetadataBase {
+
+    private final static Type listType = new TypeToken<List<Metadata>>(){}.getType();
+    private static Gson GSON;
+    static {
+        GSON = defaultInit();
+    }
+
+    /**
+     * Serializes a list of Metadata objects to Json. This does not flush or close the writer.
+     *
+     * @param metadataList list of metadata to write
+     * @param writer writer
+     * @throws org.apache.tika.exception.TikaException if there is an IOException during writing
+     */
+    public static void toJson(List<Metadata> metadataList, Writer writer) throws TikaException {
+        try {
+            GSON.toJson(metadataList, writer);
+        } catch (JsonIOException e) {
+            //keep the original exception as the cause so the stack trace is not lost
+            throw new TikaException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Reads a list of metadata objects from the reader.
+     *
+     * @param reader source of the Json; may be null
+     * @return list of Metadata, or null if the reader is null or nothing could be read from it
+     * @throws org.apache.tika.exception.TikaException in case of parse failure by Gson or IO failure with Reader
+     */
+    public static List<Metadata> fromJson(Reader reader) throws TikaException {
+        List<Metadata> ms = null;
+        if (reader == null) {
+            return ms;
+        }
+        try {
+            ms = GSON.fromJson(reader, listType);
+        } catch (com.google.gson.JsonParseException e){
+            //covers both io and parse exceptions; chain the cause for diagnosis
+            throw new TikaException(e.getMessage(), e);
+        }
+        return ms;
+    }
+
+    /**
+     * Enables setting custom configurations on Gson. Remember to register
+     * a serializer and a deserializer for Metadata. This does a literal set
+     * and does not add the default serializer and deserializers.
+     *
+     * @param gson instance to use for all subsequent calls
+     */
+    public static void setGson(Gson gson) {
+        GSON = gson;
+    }
+
+    /**
+     * Replaces the shared Gson instance with a freshly built one: the
+     * pretty-printing configuration when {@code prettyPrint} is true, the
+     * default configuration otherwise. An instance previously installed
+     * through {@link #setGson(Gson)} is discarded.
+     *
+     * @param prettyPrint whether subsequent output should be pretty printed
+     */
+    public static void setPrettyPrinting(boolean prettyPrint) {
+        if (prettyPrint) {
+            GSON = prettyInit();
+        } else {
+            GSON = defaultInit();
+        }
+    }
+
+
+}
Added: tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataListTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataListTest.java?rev=1633499&view=auto
==============================================================================
--- tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataListTest.java (added)
+++ tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataListTest.java Wed Oct 22 00:31:37 2014
@@ -0,0 +1,123 @@
+package org.apache.tika.metadata.serialization;
+
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+import java.io.StringReader;
+import java.io.StringWriter;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+import static junit.framework.Assert.assertTrue;
+import static junit.framework.TestCase.assertNull;
+import static org.junit.Assert.assertEquals;
+
+public class JsonMetadataListTest {
+
+
+ @Test
+ public void testListBasic() throws Exception {
+ Metadata m1 = new Metadata();
+ m1.add("k1", "v1");
+ m1.add("k1", "v2");
+ m1.add("k1", "v3");
+ m1.add("k1", "v4");
+ m1.add("k1", "v4");
+ m1.add("k2", "v1");
+
+ Metadata m2 = new Metadata();
+ m2.add("k3", "v1");
+ m2.add("k3", "v2");
+ m2.add("k3", "v3");
+ m2.add("k3", "v4");
+ m2.add("k3", "v4");
+ m2.add("k4", "v1");
+
+ List<Metadata> metadataList = new LinkedList<Metadata>();
+ metadataList.add(m1);
+ metadataList.add(m2);
+ StringWriter writer = new StringWriter();
+ JsonMetadataList.toJson(metadataList, writer);
+ List<Metadata> deserialized = JsonMetadataList.fromJson(new StringReader(writer.toString()));
+ assertEquals(metadataList, deserialized);
+ }
+
+ @Test
+ public void testListNull() throws Exception {
+ StringWriter writer = new StringWriter();
+ JsonMetadataList.toJson(null, writer);
+ assertEquals("", writer.toString().trim());
+
+ List<Metadata> m = JsonMetadataList.fromJson(null);
+ assertNull(m);
+ }
+
+ @Test
+ public void testListCorrupted() throws Exception {
+ String json = "[{\"k1\":[\"v1\",\"v2\",\"v3\",\"v4\",\"v4\"],\"k2\":\"v1\"}," +
+ "\"k3\":[\"v1\",\"v2\",\"v3\",\"v4\",\"v4\"],\"k4\":\"v1\"}]";
+ List<Metadata> m = JsonMetadataList.fromJson(null);
+ assertNull(m);
+ }
+
+ @Test
+ public void testPrettyPrint() throws Exception {
+ Metadata m1 = new Metadata();
+ m1.add("tika:content", "this is the content");
+ m1.add("zk1", "v1");
+ m1.add("zk1", "v2");
+ m1.add("zk1", "v3");
+ m1.add("zk1", "v4");
+ m1.add("zk1", "v4");
+ m1.add("zk2", "v1");
+
+ Metadata m2 = new Metadata();
+ m2.add("k3", "v1");
+ m2.add("k3", "v2");
+ m2.add("k3", "v3");
+ m2.add("k3", "v4");
+ m2.add("k3", "v4");
+ m2.add("k4", "v1");
+
+ List<Metadata> metadataList = new LinkedList<Metadata>();
+ metadataList.add(m1);
+ metadataList.add(m2);
+ StringWriter writer = new StringWriter();
+ JsonMetadataList.toJson(metadataList, writer);
+ assertTrue(writer.toString().startsWith("[{\"tika:content\":\"this is the content\",\"zk1\":[\"v1\",\"v2\","));
+ writer = new StringWriter();
+ JsonMetadataList.setPrettyPrinting(true);
+ JsonMetadataList.toJson(metadataList, writer);
+ assertTrue(writer.toString().startsWith("[\n" +
+ " {\n" +
+ " \"zk1\": [\n" +
+ " \"v1\",\n" +
+ " \"v2\","));
+ assertTrue(writer.toString().contains(" \"zk2\": \"v1\",\n" +
+ " \"tika:content\": \"this is the content\"\n" +
+ " },"));
+
+ //now set it back to false
+ JsonMetadataList.setPrettyPrinting(false);
+ writer = new StringWriter();
+ JsonMetadataList.toJson(metadataList, writer);
+ assertTrue(writer.toString().startsWith("[{\"tika:content\":\"this is the content\",\"zk1\":[\"v1\",\"v2\","));
+ }
+}
Modified: tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataTest.java?rev=1633499&r1=1633498&r2=1633499&view=diff
==============================================================================
--- tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataTest.java (original)
+++ tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataTest.java Wed Oct 22 00:31:37 2014
@@ -57,6 +57,24 @@ public class JsonMetadataTest {
//test that this really is 6 Chinese characters
assertEquals(6, deserialized.get("alma_mater").length());
+
+ //now test pretty print;
+ writer = new StringWriter();
+ JsonMetadata.setPrettyPrinting(true);
+ JsonMetadata.toJson(metadata, writer);
+ assertTrue(writer.toString().contains(
+ " \"json_escapes\": \"the: \\\"quick\\\" brown, fox\",\n" +
+ " \"k1\": [\n" +
+ " \"v1\",\n" +
+ " \"v2\"\n" +
+ " ],\n" +
+ " \"k3\": [\n" +
+ " \"v3\",\n" +
+ " \"v3\"\n" +
+ " ],\n" +
+ " \"k4\": \"500,000\",\n" +
+ " \"url\": \"/myApp/myAction.html?method\\u003drouter\\u0026cmd\\u003d1\"\n" +
+ "}"));
}
@Test
@@ -111,5 +129,4 @@ public class JsonMetadataTest {
Metadata deserialized = JsonMetadata.fromJson(new StringReader(writer.toString()));
assertEquals(m, deserialized);
}
-
}