You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/10 14:20:21 UTC

[tika] branch TIKA-4034 created (now d4d633388)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4034
in repository https://gitbox.apache.org/repos/asf/tika.git


      at d4d633388 TIKA-4034 -- enable configuration of pretty print in FileSystemEmitter

This branch includes the following new commits:

     new d4d633388 TIKA-4034 -- enable configuration of pretty print in FileSystemEmitter

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-4034 -- enable configuration of pretty print in FileSystemEmitter

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4034
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d4d6333885638c0e1fbcb75342929c0801abdb1e
Author: tallison <ta...@apache.org>
AuthorDate: Wed May 10 10:19:58 2023 -0400

    TIKA-4034 -- enable configuration of pretty print in FileSystemEmitter
---
 CHANGES.txt                                        |  2 ++
 .../test/java/org/apache/tika/cli/TikaCLITest.java | 22 ++++++++++++++++++++--
 .../tika/pipes/emitter/fs/FileSystemEmitter.java   | 12 +++++++++++-
 .../metadata/serialization/JsonMetadataList.java   | 18 +++++++++++++++---
 4 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index cfe44d844..4692e6b2c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 2.8.1 - ????
 
+   * Allow pretty printing in FileSystemEmitter (TIKA-4034).
+
 
 Release 2.8.0 - 5/9/2023
 
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 03544163d..170ccb500 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -30,6 +30,7 @@ import java.net.URI;
 import java.nio.file.Files;
 import java.nio.file.Path;
 
+import org.apache.commons.io.FileUtils;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.BeforeEach;
@@ -71,8 +72,8 @@ public class TikaCLITest {
                 "</basePath>" + "</fetcher>" + "</fetchers>" + "<emitters>" +
                 "<emitter class=\"org.apache.tika.pipes.emitter.fs.FileSystemEmitter\">" +
                 "<name>fse</name>" + "<basePath>" + ASYNC_OUTPUT_DIR.toAbsolutePath() +
-                "</basePath>" + "</emitter>" + "</emitters>" + "<pipesIterator " +
-                "class=\"org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator\">" +
+                "</basePath>" + "<prettyPrint>true</prettyPrint>" + "</emitter>" + "</emitters>" +
+                "<pipesIterator class=\"org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator\">" +
                 "<basePath>" + TEST_DATA_FILE.getAbsolutePath() + "</basePath>" +
                 "<fetcherName>fsf</fetcherName>" + "<emitterName>fse</emitterName>" +
                 "</pipesIterator>" + "</properties>";
@@ -595,12 +596,29 @@ public class TikaCLITest {
         int json = 0;
         for (File f : ASYNC_OUTPUT_DIR.toFile().listFiles()) {
             if (f.getName().endsWith(".json")) {
+                //check first file for pretty print
+                if (json == 0) {
+                    checkForPrettyPrint(f);
+                }
                 json++;
             }
         }
         assertEquals(17, json);
     }
 
+    /** Asserts that the expected metadata keys appear in {@code f}'s json in the given order. */
+    private void checkForPrettyPrint(File f) throws IOException {
+        String json = FileUtils.readFileToString(f, UTF_8);
+        int previous = json.indexOf("Content-Length");
+        assertTrue(previous > -1);
+        for (String k : new String[]{"Content-Type", "dc:creator",
+                "dcterms:created", "dcterms:modified", "X-TIKA:content\""}) {
+            int i = json.indexOf(k);
+            assertTrue(i > -1, "should have found " + k);
+            assertTrue(i > previous, "bad order: " + k + " at " + i + " not greater than " + previous);
+            previous = i;
+        }
+    }
 
     /**
      * reset outContent and errContent if they are not empty
diff --git a/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java b/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
index 47c517721..a90c5e509 100644
--- a/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
@@ -56,6 +56,9 @@ import org.apache.tika.pipes.emitter.TikaEmitterException;
  *                       options ('skip', 'replace', 'exception')
  *                  default is 'exception' --&gt;
  *                  &lt;param name="onExists" type="string"&gt;skip&lt;/param&gt;
+ *                  &lt;!-- optional; whether or not to pretty print the output
+ *                      default is false --&gt;
+ *                  &lt;param name="prettyPrint" type="boolean"&gt;true&lt;/param&gt;
  *              &lt;/params&gt;
  *          &lt;/emitter&gt;
  *      &lt;/emitters&gt;
@@ -67,6 +70,8 @@ public class FileSystemEmitter extends AbstractEmitter implements StreamEmitter
     private String fileExtension = "json";
     private ON_EXISTS onExists = ON_EXISTS.EXCEPTION;
 
+    private boolean prettyPrint = false;
+
     @Override
     public void emit(String emitKey, List<Metadata> metadataList)
             throws IOException, TikaEmitterException {
@@ -88,7 +93,7 @@ public class FileSystemEmitter extends AbstractEmitter implements StreamEmitter
             Files.createDirectories(output.getParent());
         }
         try (Writer writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8)) {
-            JsonMetadataList.toJson(metadataList, writer);
+            JsonMetadataList.toJson(metadataList, writer, prettyPrint);
         }
     }
 
@@ -133,6 +138,11 @@ public class FileSystemEmitter extends AbstractEmitter implements StreamEmitter
         }
     }
 
+    @Field
+    public void setPrettyPrint(boolean prettyPrint) {
+        this.prettyPrint = prettyPrint;
+    }
+
     @Override
     public void emit(String path, InputStream inputStream, Metadata userMetadata)
             throws IOException, TikaEmitterException {
diff --git a/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java b/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java
index 1b8968dbf..e008e0564 100644
--- a/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java
+++ b/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java
@@ -40,26 +40,38 @@ public class JsonMetadataList {
      *
      * @param metadataList list of metadata to write
      * @param writer       writer
+     * @param prettyPrint whether or not to pretty print the output
      * @throws org.apache.tika.exception.TikaException if there is an IOException during writing
      */
-    public static void toJson(List<Metadata> metadataList, Writer writer) throws IOException {
+    public static void toJson(List<Metadata> metadataList, Writer writer, boolean prettyPrint) throws IOException {
         if (metadataList == null) {
             writer.write("null");
             return;
         }
         try (JsonGenerator jsonGenerator = new JsonFactory()
                 .createGenerator(new CloseShieldWriter(writer))) {
-            if (PRETTY_PRINT) {
+            if (prettyPrint) {
                 jsonGenerator.useDefaultPrettyPrinter();
             }
             jsonGenerator.writeStartArray();
             for (Metadata m : metadataList) {
-                JsonMetadata.writeMetadataObject(m, jsonGenerator, PRETTY_PRINT);
+                JsonMetadata.writeMetadataObject(m, jsonGenerator, prettyPrint);
             }
             jsonGenerator.writeEndArray();
         }
     }
 
+    /**
+     * Serializes a list of Metadata objects to Json, passing {@code PRETTY_PRINT} as the
+     * pretty print flag.  This does not flush or close the writer.
+     * @param metadataList list of metadata to write
+     * @param writer       writer
+     * @throws IOException if there is an IOException during writing
+     */
+    public static void toJson(List<Metadata> metadataList, Writer writer) throws IOException {
+        toJson(metadataList, writer, PRETTY_PRINT);
+    }
+
     /**
      * Read metadata from reader. This does not close the reader
      *