You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/10 15:20:31 UTC

[tika] branch main updated: TIKA-4034 (#1125)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 4ce287ab2 TIKA-4034 (#1125)
4ce287ab2 is described below

commit 4ce287ab2bcab17d8b2e94c01dda75474345e479
Author: Tim Allison <ta...@apache.org>
AuthorDate: Wed May 10 11:20:24 2023 -0400

    TIKA-4034 (#1125)
    
    * TIKA-4034 -- enable configuration of pretty print in FileSystemEmitter
---
 CHANGES.txt                                        |  2 ++
 .../test/java/org/apache/tika/cli/TikaCLITest.java | 22 ++++++++++++++++++++--
 .../tika/pipes/emitter/fs/FileSystemEmitter.java   | 12 +++++++++++-
 .../metadata/serialization/JsonMetadataList.java   | 18 +++++++++++++++---
 4 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index cfe44d844..4692e6b2c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 2.8.1 - ????
 
+   * Allow pretty printing in FileSystemEmitter (TIKA-4034).
+
 
 Release 2.8.0 - 5/9/2023
 
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 03544163d..e6c5c2296 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -30,6 +30,7 @@ import java.net.URI;
 import java.nio.file.Files;
 import java.nio.file.Path;
 
+import org.apache.commons.io.FileUtils;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.BeforeEach;
@@ -71,8 +72,8 @@ public class TikaCLITest {
                 "</basePath>" + "</fetcher>" + "</fetchers>" + "<emitters>" +
                 "<emitter class=\"org.apache.tika.pipes.emitter.fs.FileSystemEmitter\">" +
                 "<name>fse</name>" + "<basePath>" + ASYNC_OUTPUT_DIR.toAbsolutePath() +
-                "</basePath>" + "</emitter>" + "</emitters>" + "<pipesIterator " +
-                "class=\"org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator\">" +
+                "</basePath>" + "<prettyPrint>true</prettyPrint>" + "</emitter>" + "</emitters>" +
+                "<pipesIterator class=\"org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator\">" +
                 "<basePath>" + TEST_DATA_FILE.getAbsolutePath() + "</basePath>" +
                 "<fetcherName>fsf</fetcherName>" + "<emitterName>fse</emitterName>" +
                 "</pipesIterator>" + "</properties>";
@@ -595,12 +596,29 @@ public class TikaCLITest {
         int json = 0;
         for (File f : ASYNC_OUTPUT_DIR.toFile().listFiles()) {
             if (f.getName().endsWith(".json")) {
+                //check one file for pretty print
+                if (f.getName().equals("coffee.xls.json")) {
+                    checkForPrettyPrint(f);
+                }
                 json++;
             }
         }
         assertEquals(17, json);
     }
 
+    private void checkForPrettyPrint(File f) throws IOException {
+        String json = FileUtils.readFileToString(f, UTF_8);
+        int previous = json.indexOf("Content-Length");
+        assertTrue(previous > -1);
+        for (String k : new String[]{"Content-Type", "dc:creator",
+                "dcterms:created", "dcterms:modified", "X-TIKA:content\""}) {
+            int i = json.indexOf(k);
+            assertTrue( i > -1, "should have found " + k);
+            assertTrue(i > previous, "bad order: " + k + " at " + i + " not greater than " + previous);
+            previous = i;
+        }
+    }
+
 
     /**
      * reset outContent and errContent if they are not empty
diff --git a/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java b/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
index 47c517721..a90c5e509 100644
--- a/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
@@ -56,6 +56,9 @@ import org.apache.tika.pipes.emitter.TikaEmitterException;
  *                       options ('skip', 'replace', 'exception')
  *                  default is 'exception' --&gt;
  *                  &lt;param name="onExists" type="string"&gt;skip&lt;/param&gt;
+ *                  &lt;!-- optional; whether or not to pretty print the output
+ *                      default is false --&gt;
+ *                     &lt;param name="prettyPrint" type="boolean"&gt;true&lt;/param&gt;
  *              &lt;/params&gt;
  *          &lt;/emitter&gt;
  *      &lt;/emitters&gt;
@@ -67,6 +70,8 @@ public class FileSystemEmitter extends AbstractEmitter implements StreamEmitter
     private String fileExtension = "json";
     private ON_EXISTS onExists = ON_EXISTS.EXCEPTION;
 
+    private boolean prettyPrint = false;
+
     @Override
     public void emit(String emitKey, List<Metadata> metadataList)
             throws IOException, TikaEmitterException {
@@ -88,7 +93,7 @@ public class FileSystemEmitter extends AbstractEmitter implements StreamEmitter
             Files.createDirectories(output.getParent());
         }
         try (Writer writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8)) {
-            JsonMetadataList.toJson(metadataList, writer);
+            JsonMetadataList.toJson(metadataList, writer, prettyPrint);
         }
     }
 
@@ -133,6 +138,11 @@ public class FileSystemEmitter extends AbstractEmitter implements StreamEmitter
         }
     }
 
+    @Field
+    public void setPrettyPrint(boolean prettyPrint) {
+        this.prettyPrint = prettyPrint;
+    }
+
     @Override
     public void emit(String path, InputStream inputStream, Metadata userMetadata)
             throws IOException, TikaEmitterException {
diff --git a/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java b/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java
index 1b8968dbf..e008e0564 100644
--- a/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java
+++ b/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java
@@ -40,26 +40,38 @@ public class JsonMetadataList {
      *
      * @param metadataList list of metadata to write
      * @param writer       writer
+     * @param prettyPrint whether or not to pretty print the output
      * @throws org.apache.tika.exception.TikaException if there is an IOException during writing
      */
-    public static void toJson(List<Metadata> metadataList, Writer writer) throws IOException {
+    public static void toJson(List<Metadata> metadataList, Writer writer, boolean prettyPrint) throws IOException {
         if (metadataList == null) {
             writer.write("null");
             return;
         }
         try (JsonGenerator jsonGenerator = new JsonFactory()
                 .createGenerator(new CloseShieldWriter(writer))) {
-            if (PRETTY_PRINT) {
+            if (prettyPrint) {
                 jsonGenerator.useDefaultPrettyPrinter();
             }
             jsonGenerator.writeStartArray();
             for (Metadata m : metadataList) {
-                JsonMetadata.writeMetadataObject(m, jsonGenerator, PRETTY_PRINT);
+                JsonMetadata.writeMetadataObject(m, jsonGenerator, prettyPrint);
             }
             jsonGenerator.writeEndArray();
         }
     }
 
+    /**
+     * Serializes a list of Metadata objects to Json.  This does not flush or close the writer.
+     *
+     * @param metadataList list of metadata to write
+     * @param writer       writer
+     * @throws IOException if there is a problem during writing
+     */
+    public static void toJson(List<Metadata> metadataList, Writer writer) throws IOException {
+        toJson(metadataList, writer, PRETTY_PRINT);
+    }
+
     /**
      * Read metadata from reader. This does not close the reader
      *