You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/10/22 02:31:38 UTC

svn commit: r1633499 - in /tika/trunk: ./ tika-app/src/main/java/org/apache/tika/cli/ tika-app/src/main/java/org/apache/tika/gui/ tika-app/src/test/java/org/apache/tika/cli/ tika-app/src/test/resources/test-data/ tika-serialization/src/main/java/org/ap...

Author: tallison
Date: Wed Oct 22 00:31:37 2014
New Revision: 1633499

URL: http://svn.apache.org/r1633499
Log:
TIKA-1451 add RecursiveParserWrapper output to CLI and GUI

Added:
    tika/trunk/tika-app/src/test/resources/test-data/test_recursive_embedded.docx   (with props)
    tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataBase.java
    tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java
    tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataListTest.java
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
    tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadata.java
    tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1633499&r1=1633498&r2=1633499&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Oct 22 00:31:37 2014
@@ -1,12 +1,15 @@
 Release 1.7 - Current Development
 
+  * Tika CLI and GUI now have an option to view a JSON rendering of the
+    output of RecursiveParserWrapper (TIKA-1451).
+
   * Tika now integrates the Geospatial Data Abstraction Library
     (GDAL) for parsing hundreds of geospatial formats (TIKA-605).
 
   * ExternalParsers can now use Regexs to specify dynamic keys
    (TIKA-1441).
 
-  * Thread safety issue in ImageMetadataExtractor were resolved
+  * Thread safety issues in ImageMetadataExtractor were resolved
     (TIKA-1369).
  
   * The ForkParser service is now registered in Activator

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1633499&r1=1633498&r2=1633499&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed Oct 22 00:31:37 2014
@@ -16,6 +16,37 @@
  */
 package org.apache.tika.cli;
 
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintStream;
+import java.io.PrintWriter;
+import java.io.UnsupportedEncodingException;
+import java.io.Writer;
+import java.lang.reflect.Field;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.net.URI;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.log4j.BasicConfigurator;
@@ -44,6 +75,7 @@ import org.apache.tika.language.Language
 import org.apache.tika.language.ProfilingHandler;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.serialization.JsonMetadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MediaTypeRegistry;
 import org.apache.tika.mime.MimeTypeException;
@@ -54,45 +86,17 @@ import org.apache.tika.parser.ParseConte
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
 import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerFactory;
 import org.apache.tika.sax.ExpandedTitleContentHandler;
 import org.apache.tika.xmp.XMPMetadata;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.TransformerConfigurationException;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintStream;
-import java.io.PrintWriter;
-import java.io.UnsupportedEncodingException;
-import java.io.Writer;
-import java.lang.reflect.Field;
-import java.net.ServerSocket;
-import java.net.Socket;
-import java.net.URI;
-import java.net.URL;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-
 /**
  * Simple command line interface for Apache Tika.
  */
@@ -281,6 +285,8 @@ public class TikaCLI {
     private String configFilePath;
 
     private OutputType type = XML;
+
+    private boolean recursiveJSON = false;
     
     private LanguageProfilerBuilder ngp = null;
 
@@ -367,7 +373,9 @@ public class TikaCLI {
             password = arg.substring("--password=".length());
         } else  if (arg.equals("-j") || arg.equals("--json")) {
             type = JSON;
-        } else  if (arg.equals("-y") || arg.equals("--xmp")) {
+        } else if (arg.equals("-J") || arg.equals("--jsonRecursive")) {
+            recursiveJSON = true;
+        } else if (arg.equals("-y") || arg.equals("--xmp")) {
             type = XMP;
         } else if (arg.equals("-x") || arg.equals("--xml")) {
             type = XML;
@@ -423,18 +431,55 @@ public class TikaCLI {
                 } else {
                     url = new URL(arg);
                 }
-                Metadata metadata = new Metadata();
-                InputStream input = TikaInputStream.get(url, metadata);
-                try {
-                    type.process(input, System.out, metadata);
-                } finally {
-                    input.close();
-                    System.out.flush();
+                if (recursiveJSON) {
+                    handleRecursiveJson(url, System.out);
+                } else {
+                    Metadata metadata = new Metadata();
+                    InputStream input = TikaInputStream.get(url, metadata);
+                    try {
+                        type.process(input, System.out, metadata);
+                    } finally {
+                        input.close();
+                        System.out.flush();
+                    }
                 }
             }
         }
     }
 
+    private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException {
+        Metadata metadata = new Metadata();
+        InputStream input = TikaInputStream.get(url, metadata);
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, getContentHandlerFactory(type));
+        try {
+            wrapper.parse(input, null, metadata, context);
+        } finally {
+            input.close();
+        }
+        JsonMetadataList.setPrettyPrinting(prettyPrint);
+        Writer writer = getOutputWriter(output, encoding);
+        try {
+            JsonMetadataList.toJson(wrapper.getMetadata(), writer);
+        } finally {
+            writer.flush();
+        }
+    }
+
+    private ContentHandlerFactory getContentHandlerFactory(OutputType type) {
+        BasicContentHandlerFactory.HANDLER_TYPE handlerType = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
+        if (type.equals(HTML)) {
+            handlerType = BasicContentHandlerFactory.HANDLER_TYPE.HTML;
+        } else if (type.equals(XML)) {
+            handlerType = BasicContentHandlerFactory.HANDLER_TYPE.XML;
+        } else if (type.equals(TEXT)) {
+            handlerType = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
+        } else if (type.equals(TEXT_MAIN)) {
+            handlerType = BasicContentHandlerFactory.HANDLER_TYPE.BODY;
+        } else if (type.equals(METADATA)) {
+            handlerType = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
+        }
+        return new BasicContentHandlerFactory(handlerType, -1);
+    }
     private void usage() {
         PrintStream out = System.out;
         out.println("usage: java -jar tika-app.jar [option...] [file|port...]");
@@ -458,13 +503,16 @@ public class TikaCLI {
         out.println("    -m  or --metadata      Output only metadata");
         out.println("    -j  or --json          Output metadata in JSON");
         out.println("    -y  or --xmp           Output metadata in XMP");
+        out.println("    -J  or --jsonRecursive Output metadata and content from all");
+        out.println("                           embedded files (choose content type");
+        out.println("                           with -x, -h, -t or -m; default is -x)");
         out.println("    -l  or --language      Output only language");
         out.println("    -d  or --detect        Detect document type");
         out.println("    -eX or --encoding=X    Use output encoding X");
         out.println("    -pX or --password=X    Use document password X");
         out.println("    -z  or --extract       Extract all attachements into current directory");
         out.println("    --extract-dir=<dir>    Specify target directory for -z");
-        out.println("    -r  or --pretty-print  For XML and XHTML outputs, adds newlines and");
+        out.println("    -r  or --pretty-print  For JSON, XML and XHTML outputs, adds newlines and");
         out.println("                           whitespace, for better readability");
         out.println();
         out.println("    --create-profile=X");
@@ -950,6 +998,7 @@ public class TikaCLI {
         @Override
         public void endDocument() throws SAXException {
             try {
+                JsonMetadata.setPrettyPrinting(prettyPrint);
                 JsonMetadata.toJson(metadata, writer);
                 writer.flush();
             } catch (TikaException e) {

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=1633499&r1=1633498&r2=1633499&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Wed Oct 22 00:31:37 2014
@@ -16,27 +16,6 @@
  */
 package org.apache.tika.gui;
 
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.DocumentSelector;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.html.BoilerpipeContentHandler;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.ContentHandlerDecorator;
-import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
 import javax.swing.Box;
 import javax.swing.JDialog;
 import javax.swing.JEditorPane;
@@ -82,6 +61,30 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.DocumentSelector;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
 /**
  * Simple Swing GUI for Apache Tika. You can drag and drop files on top
  * of the window to have them parsed.
@@ -117,6 +120,8 @@ public class TikaGUI extends JFrame
         });
     }
 
+    //read limit passed to mark() so the stream can be reset and reparsed for the JSON view
+    private final int MAX_MARK = 20971520;//20MB
     /**
      * Parsing context.
      */
@@ -163,6 +168,11 @@ public class TikaGUI extends JFrame
     private final JEditorPane xml;
 
     /**
+     * Raw JSON source.
+     */
+    private final JEditorPane json;
+
+    /**
      * Document metadata.
      */
     private final JEditorPane metadata;
@@ -185,6 +195,7 @@ public class TikaGUI extends JFrame
         text = addCard(cards, "text/plain", "text");
         textMain = addCard(cards, "text/plain", "main");
         xml = addCard(cards, "text/plain", "xhtml");
+        json = addCard(cards, "text/plain", "json");
         add(cards);
         layout.show(cards, "welcome");
 
@@ -217,6 +228,7 @@ public class TikaGUI extends JFrame
         addMenuItem(view, "Plain text", "text", KeyEvent.VK_P);
         addMenuItem(view, "Main content", "main", KeyEvent.VK_C);
         addMenuItem(view, "Structured text", "xhtml", KeyEvent.VK_S);
+        addMenuItem(view, "Recursive JSON", "json", KeyEvent.VK_J);
         bar.add(view);
 
         bar.add(Box.createHorizontalGlue());
@@ -267,6 +279,8 @@ public class TikaGUI extends JFrame
             layout.show(cards, command);
         } else if ("metadata".equals(command)) {
             layout.show(cards, command);
+        } else if ("json".equals(command)) {
+            layout.show(cards, command);
         } else if ("about".equals(command)) {
             textDialog(
                     "About Apache Tika",
@@ -320,7 +334,9 @@ public class TikaGUI extends JFrame
                 getXmlContentHandler(xmlBuffer));
 
         context.set(DocumentSelector.class, new ImageDocumentSelector());
-
+        if (input.markSupported()) {
+            input.mark(MAX_MARK);
+        }
         input = new ProgressMonitorInputStream(
                 this, "Parsing stream", input);
         parser.parse(input, handler, md, context);
@@ -346,6 +362,30 @@ public class TikaGUI extends JFrame
         setText(text, textBuffer.toString());
         setText(textMain, textMainBuffer.toString());
         setText(html, htmlBuffer.toString());
+        if (!input.markSupported()) {
+            setText(json, "InputStream does not support mark/reset for Recursive Parsing");
+            layout.show(cards, "metadata");
+            return;
+        }
+        boolean isReset = false;
+        try {
+            input.reset();
+            isReset = true;
+        } catch (IOException e) {
+            setText(json, "Error during stream reset.\n"+
+                    "There's a limit of "+MAX_MARK + " bytes for this type of processing in the GUI.\n"+
+                    "Try the app with command line argument of -J."
+            );
+        }
+        if (isReset) {
+            RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser,
+                    new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
+            wrapper.parse(input, null, new Metadata(), new ParseContext());
+            StringWriter jsonBuffer = new StringWriter();
+            JsonMetadataList.setPrettyPrinting(true);
+            JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer);
+            setText(json, jsonBuffer.toString());
+        }
         layout.show(cards, "metadata");
     }
 

Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1633499&r1=1633498&r2=1633499&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Wed Oct 22 00:31:37 2014
@@ -16,17 +16,17 @@
  */
 package org.apache.tika.cli;
 
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.PrintStream;
+import java.net.URI;
+
 import org.apache.commons.io.FileUtils;
 import org.apache.tika.exception.TikaException;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.PrintStream;
-import java.net.URI;
-
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
@@ -160,6 +160,29 @@ public class TikaCLITest {
     }
 
     /**
+     * Tests the --json option combined with the -r (pretty-print) option
+     *
+     * @throws Exception
+     */
+    @Test
+    public void testJsonMetadataPrettyPrintOutput() throws Exception {
+        String[] params = {"--json", "-r", resourcePrefix + "testJsonMultipleInts.html"};
+        TikaCLI.main(params);
+        String json = outContent.toString("UTF-8");
+
+        assertTrue(json.contains("  \"X-Parsed-By\": [\n" +
+                "    \"org.apache.tika.parser.DefaultParser\",\n" +
+                "    \"org.apache.tika.parser.html.HtmlParser\"\n" +
+                "  ],\n"));
+        //test legacy alphabetic sort of keys
+        int enc = json.indexOf("\"Content-Encoding\"");
+        int fb = json.indexOf("fb:admins");
+        int title = json.indexOf("\"title\"");
+        assertTrue(enc > -1 && fb > -1 && enc < fb);
+        assertTrue (fb > -1 && title > -1 && fb < title);
+    }
+
+    /**
      * Tests -l option of the cli
      * 
      * @throws Exception
@@ -321,4 +344,40 @@ public class TikaCLITest {
         assertTrue(content.contains("apple"));
         assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser"));
     }
+
+    @Test
+    public void testJsonRecursiveMetadataParserMetadataOnly() throws Exception {
+        String[] params = new String[]{"-m", "-J", "-r", resourcePrefix+"test_recursive_embedded.docx"};
+        TikaCLI.main(params);
+        String content = outContent.toString("UTF-8");
+        assertTrue(content.contains("[\n" +
+                "  {\n" +
+                "    \"Application-Name\": \"Microsoft Office Word\",\n" +
+                "    \"Application-Version\": \"15.0000\",\n" +
+                "    \"Character Count\": \"28\",\n" +
+                "    \"Character-Count-With-Spaces\": \"31\","));
+        assertTrue(content.endsWith("    \"tika:embedded_resource_path\": \"test_recursive_embedded.docx/embed1.zip\"\n" +
+                "  }\n" +
+                "]"));
+        assertFalse(content.contains("tika:content"));
+
+    }
+
+    @Test
+    public void testJsonRecursiveMetadataParserDefault() throws Exception {
+        String[] params = new String[]{"-J", "-r", resourcePrefix+"test_recursive_embedded.docx"};
+        TikaCLI.main(params);
+        String content = outContent.toString("UTF-8");
+        assertTrue(content.contains("\"tika:content\": \"\\u003chtml xmlns\\u003d\\\"http://www.w3.org/1999/xhtml"));
+    }
+
+    @Test
+    public void testJsonRecursiveMetadataParserText() throws Exception {
+        String[] params = new String[]{"-J", "-r", "-t", resourcePrefix+"test_recursive_embedded.docx"};
+        TikaCLI.main(params);
+        String content = outContent.toString("UTF-8");
+        assertTrue(content.contains("\\n\\nembed_4\\n"));
+        assertTrue(content.contains("\\n\\nembed_0"));
+    }
+
 }

Added: tika/trunk/tika-app/src/test/resources/test-data/test_recursive_embedded.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/test-data/test_recursive_embedded.docx?rev=1633499&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-app/src/test/resources/test-data/test_recursive_embedded.docx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadata.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadata.java?rev=1633499&r1=1633498&r2=1633499&view=diff
==============================================================================
--- tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadata.java (original)
+++ tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadata.java Wed Oct 22 00:31:37 2014
@@ -21,25 +21,17 @@ package org.apache.tika.metadata.seriali
 import java.io.Reader;
 import java.io.Writer;
 
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-
 import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
 import com.google.gson.JsonIOException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
 
-public class JsonMetadata {
-    
+public class JsonMetadata extends JsonMetadataBase{
     private static Gson GSON;
-    
+
     static {
-        GsonBuilder builder = new GsonBuilder();
-        builder.registerTypeHierarchyAdapter(Metadata.class, new JsonMetadataSerializer());
-        builder.registerTypeHierarchyAdapter(Metadata.class, new JsonMetadataDeserializer());
-        GSON = builder.create();
+        GSON = defaultInit();
     }
-
-    
     /**
      * Serializes a Metadata object to Json.  This does not flush or close the writer.
      * 
@@ -72,15 +64,24 @@ public class JsonMetadata {
         }
         return m;
     }
-    
+
     /**
      * Enables setting custom configurations on Gson.  Remember to register
      * a serializer and a deserializer for Metadata.  This does a literal set
      * and does not add the default serializer and deserializers.
-     * 
+     *
      * @param gson
      */
     public static void setGson(Gson gson) {
         GSON = gson;
     }
+
+    public static void setPrettyPrinting(boolean prettyPrint) {
+        if (prettyPrint) {
+            GSON = prettyInit();
+        } else {
+            GSON = defaultInit();
+        }
+    }
+
 }

Added: tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataBase.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataBase.java?rev=1633499&view=auto
==============================================================================
--- tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataBase.java (added)
+++ tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataBase.java Wed Oct 22 00:31:37 2014
@@ -0,0 +1,99 @@
+package org.apache.tika.metadata.serialization;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Factory methods for the Gson instances used by the JSON metadata
+ * serialization classes that extend this class.
+ */
+public class JsonMetadataBase {
+
+    /**
+     * Creates a Gson with the default Metadata serializer and deserializer
+     * registered and without pretty printing.
+     *
+     * @return a newly built Gson
+     */
+    static Gson defaultInit() {
+        GsonBuilder builder = new GsonBuilder();
+        builder.registerTypeHierarchyAdapter(Metadata.class, new JsonMetadataSerializer());
+        builder.registerTypeHierarchyAdapter(Metadata.class, new JsonMetadataDeserializer());
+        return builder.create();
+    }
+
+    /**
+     * Creates a Gson that pretty-prints and uses a serializer whose Metadata
+     * keys are sorted, with "tika:content" placed after the other keys.
+     *
+     * @return a newly built Gson
+     */
+    static Gson prettyInit() {
+        GsonBuilder builder = new GsonBuilder();
+        builder.registerTypeHierarchyAdapter(Metadata.class, new SortedJsonMetadataSerializer());
+        builder.registerTypeHierarchyAdapter(Metadata.class, new JsonMetadataDeserializer());
+        builder.setPrettyPrinting();
+        return builder.create();
+    }
+
+    /**
+     * Serializer whose getNames() returns the Metadata keys in sorted order.
+     */
+    private static class SortedJsonMetadataSerializer extends JsonMetadataSerializer {
+
+        //this is stinky.  This should reference RecursiveParserWrapper.TIKA_CONTENT
+        //but that would require making core a dependency of serialization...
+        //do we want to do that?
+        private static final String TIKA_CONTENT_KEY = "tika:content";
+
+        @Override
+        public String[] getNames(Metadata m) {
+            String[] names = m.names();
+            Arrays.sort(names, new MetadataKeyComparator());
+            return names;
+        }
+
+        /**
+         * Orders keys by String.compareTo(), except that "tika:content"
+         * follows every other non-null key and null keys come last.
+         */
+        private static class MetadataKeyComparator implements java.util.Comparator<String> {
+            @Override
+            public int compare(String s1, String s2) {
+                if (s1 == null) {
+                    //two nulls are equal; returning 1 here would break the Comparator contract
+                    return (s2 == null) ? 0 : 1;
+                } else if (s2 == null) {
+                    return -1;
+                }
+
+                if (s1.equals(TIKA_CONTENT_KEY)) {
+                    return s2.equals(TIKA_CONTENT_KEY) ? 0 : 2;
+                } else if (s2.equals(TIKA_CONTENT_KEY)) {
+                    return -2;
+                }
+                return s1.compareTo(s2);
+            }
+        }
+    }
+}

Added: tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java?rev=1633499&view=auto
==============================================================================
--- tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java (added)
+++ tika/trunk/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java Wed Oct 22 00:31:37 2014
@@ -0,0 +1,105 @@
+package org.apache.tika.metadata.serialization;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.Reader;
+import java.io.Writer;
+import java.lang.reflect.Type;
+import java.util.List;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonIOException;
+import com.google.gson.reflect.TypeToken;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Serializes and deserializes lists of {@link Metadata} objects as Json,
+ * using one static Gson instance for every call.
+ */
+public class JsonMetadataList extends JsonMetadataBase {
+
+    private final static Type listType = new TypeToken<List<Metadata>>(){}.getType();
+    private static Gson GSON;
+    static {
+        GSON = defaultInit();
+    }
+
+    /**
+     * Serializes a list of Metadata objects to Json.  This does not flush or close the writer.
+     *
+     * @param metadataList list of metadata to write
+     * @param writer writer
+     * @throws org.apache.tika.exception.TikaException if there is an IOException during writing
+     */
+    public static void toJson(List<Metadata> metadataList, Writer writer) throws TikaException {
+        try {
+            GSON.toJson(metadataList, writer);
+        } catch (JsonIOException e) {
+            //keep the Gson exception as the cause so its stack trace is not lost
+            throw new TikaException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Reads a list of metadata from the reader.
+     *
+     * @param reader reader to read the Json from; may be null
+     * @return list of Metadata, or null if the reader is null or nothing could be read from it
+     * @throws org.apache.tika.exception.TikaException in case of parse failure by Gson or IO failure with Reader
+     */
+    public static List<Metadata> fromJson(Reader reader) throws TikaException {
+        if (reader == null) {
+            return null;
+        }
+        try {
+            return GSON.fromJson(reader, listType);
+        } catch (com.google.gson.JsonParseException e){
+            //covers both io and parse exceptions
+            throw new TikaException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Enables setting custom configurations on Gson.  Remember to register
+     * a serializer and a deserializer for Metadata.  This does a literal set
+     * and does not add the default serializer and deserializers.
+     *
+     * @param gson the Gson to use for all subsequent calls
+     */
+    public static void setGson(Gson gson) {
+        GSON = gson;
+    }
+
+    /**
+     * Replaces the static Gson with a freshly built one, discarding any Gson
+     * previously supplied through {@link #setGson(Gson)}.
+     *
+     * @param prettyPrint true to build the Gson with prettyInit(),
+     *                    false to build it with defaultInit()
+     */
+    public static void setPrettyPrinting(boolean prettyPrint) {
+        if (prettyPrint) {
+            GSON = prettyInit();
+        } else {
+            GSON = defaultInit();
+        }
+    }
+
+}

Added: tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataListTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataListTest.java?rev=1633499&view=auto
==============================================================================
--- tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataListTest.java (added)
+++ tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataListTest.java Wed Oct 22 00:31:37 2014
@@ -0,0 +1,123 @@
+package org.apache.tika.metadata.serialization;
+
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+import java.io.StringReader;
+import java.io.StringWriter;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+import static junit.framework.Assert.assertTrue;
+import static junit.framework.TestCase.assertNull;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests for {@link JsonMetadataList}: round-tripping a list of Metadata
+ * objects through JSON, null handling, corrupted input and pretty printing.
+ */
+public class JsonMetadataListTest {
+
+    /**
+     * Serializes a two-element list and checks that deserializing the
+     * output yields a list equal to the original.
+     */
+    @Test
+    public void testListBasic() throws Exception {
+        Metadata m1 = new Metadata();
+        m1.add("k1", "v1");
+        m1.add("k1", "v2");
+        m1.add("k1", "v3");
+        m1.add("k1", "v4");
+        m1.add("k1", "v4");
+        m1.add("k2", "v1");
+
+        Metadata m2 = new Metadata();
+        m2.add("k3", "v1");
+        m2.add("k3", "v2");
+        m2.add("k3", "v3");
+        m2.add("k3", "v4");
+        m2.add("k3", "v4");
+        m2.add("k4", "v1");
+
+        List<Metadata> metadataList = new LinkedList<Metadata>();
+        metadataList.add(m1);
+        metadataList.add(m2);
+        StringWriter writer = new StringWriter();
+        JsonMetadataList.toJson(metadataList, writer);
+        List<Metadata> deserialized = JsonMetadataList.fromJson(new StringReader(writer.toString()));
+        assertEquals(metadataList, deserialized);
+    }
+
+    /**
+     * A null list must serialize to nothing but whitespace, and a null
+     * reader must deserialize to null.
+     */
+    @Test
+    public void testListNull() throws Exception {
+        StringWriter writer = new StringWriter();
+        JsonMetadataList.toJson(null, writer);
+        assertEquals("", writer.toString().trim());
+
+        List<Metadata> m = JsonMetadataList.fromJson(null);
+        assertNull(m);
+    }
+
+    /**
+     * Corrupted JSON must be reported as a TikaException.  The exception
+     * class is fully qualified so that no extra import is required.
+     */
+    @Test(expected = org.apache.tika.exception.TikaException.class)
+    public void testListCorrupted() throws Exception {
+        //the second object is missing its opening brace
+        String json = "[{\"k1\":[\"v1\",\"v2\",\"v3\",\"v4\",\"v4\"],\"k2\":\"v1\"}," +
+                "\"k3\":[\"v1\",\"v2\",\"v3\",\"v4\",\"v4\"],\"k4\":\"v1\"}]";
+        //actually feed the corrupted string to the parser
+        JsonMetadataList.fromJson(new StringReader(json));
+    }
+
+    /**
+     * Checks the compact output, then the pretty-printed output, then the
+     * compact output again after pretty printing has been switched off.
+     */
+    @Test
+    public void testPrettyPrint() throws Exception {
+        Metadata m1 = new Metadata();
+        m1.add("tika:content", "this is the content");
+        m1.add("zk1", "v1");
+        m1.add("zk1", "v2");
+        m1.add("zk1", "v3");
+        m1.add("zk1", "v4");
+        m1.add("zk1", "v4");
+        m1.add("zk2", "v1");
+
+        Metadata m2 = new Metadata();
+        m2.add("k3", "v1");
+        m2.add("k3", "v2");
+        m2.add("k3", "v3");
+        m2.add("k3", "v4");
+        m2.add("k3", "v4");
+        m2.add("k4", "v1");
+
+        List<Metadata> metadataList = new LinkedList<Metadata>();
+        metadataList.add(m1);
+        metadataList.add(m2);
+        StringWriter writer = new StringWriter();
+        JsonMetadataList.toJson(metadataList, writer);
+        assertTrue(writer.toString().startsWith("[{\"tika:content\":\"this is the content\",\"zk1\":[\"v1\",\"v2\","));
+
+        //pretty printing is a static setting; reset it in finally so a
+        //failed assertion cannot leak it into other tests
+        try {
+            writer = new StringWriter();
+            JsonMetadataList.setPrettyPrinting(true);
+            JsonMetadataList.toJson(metadataList, writer);
+            //the expected pretty output lists tika:content last, not first
+            assertTrue(writer.toString().startsWith("[\n" +
+                    "  {\n" +
+                    "    \"zk1\": [\n" +
+                    "      \"v1\",\n" +
+                    "      \"v2\","));
+            assertTrue(writer.toString().contains("    \"zk2\": \"v1\",\n" +
+                    "    \"tika:content\": \"this is the content\"\n" +
+                    "  },"));
+        } finally {
+            //now set it back to false
+            JsonMetadataList.setPrettyPrinting(false);
+        }
+
+        writer = new StringWriter();
+        JsonMetadataList.toJson(metadataList, writer);
+        assertTrue(writer.toString().startsWith("[{\"tika:content\":\"this is the content\",\"zk1\":[\"v1\",\"v2\","));
+    }
+}

Modified: tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataTest.java?rev=1633499&r1=1633498&r2=1633499&view=diff
==============================================================================
--- tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataTest.java (original)
+++ tika/trunk/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonMetadataTest.java Wed Oct 22 00:31:37 2014
@@ -57,6 +57,24 @@ public class JsonMetadataTest {
 
         //test that this really is 6 Chinese characters
         assertEquals(6, deserialized.get("alma_mater").length());
+
+        //now test pretty print;
+        writer = new StringWriter();
+        JsonMetadata.setPrettyPrinting(true);
+        JsonMetadata.toJson(metadata, writer);
+        assertTrue(writer.toString().contains(
+                "  \"json_escapes\": \"the: \\\"quick\\\" brown, fox\",\n" +
+                "  \"k1\": [\n" +
+                "    \"v1\",\n" +
+                "    \"v2\"\n" +
+                "  ],\n" +
+                "  \"k3\": [\n" +
+                "    \"v3\",\n" +
+                "    \"v3\"\n" +
+                "  ],\n" +
+                "  \"k4\": \"500,000\",\n" +
+                "  \"url\": \"/myApp/myAction.html?method\\u003drouter\\u0026cmd\\u003d1\"\n" +
+                "}"));
     }
     
     @Test
@@ -111,5 +129,4 @@ public class JsonMetadataTest {
         Metadata deserialized = JsonMetadata.fromJson(new StringReader(writer.toString()));
         assertEquals(m, deserialized);        
     }
-
 }