You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/10 15:20:31 UTC
[tika] branch main updated: TIKA-4034 (#1125)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 4ce287ab2 TIKA-4034 (#1125)
4ce287ab2 is described below
commit 4ce287ab2bcab17d8b2e94c01dda75474345e479
Author: Tim Allison <ta...@apache.org>
AuthorDate: Wed May 10 11:20:24 2023 -0400
TIKA-4034 (#1125)
* TIKA-4034 -- enable configuration of pretty print in FileSystemEmitter
---
CHANGES.txt | 2 ++
.../test/java/org/apache/tika/cli/TikaCLITest.java | 22 ++++++++++++++++++++--
.../tika/pipes/emitter/fs/FileSystemEmitter.java | 12 +++++++++++-
.../metadata/serialization/JsonMetadataList.java | 18 +++++++++++++++---
4 files changed, 48 insertions(+), 6 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index cfe44d844..4692e6b2c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 2.8.1 - ????
+ * Allow pretty printing in FileSystemEmitter (TIKA-4034).
+
Release 2.8.0 - 5/9/2023
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 03544163d..e6c5c2296 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -30,6 +30,7 @@ import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
+import org.apache.commons.io.FileUtils;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
@@ -71,8 +72,8 @@ public class TikaCLITest {
"</basePath>" + "</fetcher>" + "</fetchers>" + "<emitters>" +
"<emitter class=\"org.apache.tika.pipes.emitter.fs.FileSystemEmitter\">" +
"<name>fse</name>" + "<basePath>" + ASYNC_OUTPUT_DIR.toAbsolutePath() +
- "</basePath>" + "</emitter>" + "</emitters>" + "<pipesIterator " +
- "class=\"org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator\">" +
+ "</basePath>" + "<prettyPrint>true</prettyPrint>" + "</emitter>" + "</emitters>" +
+ "<pipesIterator class=\"org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator\">" +
"<basePath>" + TEST_DATA_FILE.getAbsolutePath() + "</basePath>" +
"<fetcherName>fsf</fetcherName>" + "<emitterName>fse</emitterName>" +
"</pipesIterator>" + "</properties>";
@@ -595,12 +596,29 @@ public class TikaCLITest {
int json = 0;
for (File f : ASYNC_OUTPUT_DIR.toFile().listFiles()) {
if (f.getName().endsWith(".json")) {
+ //check one file for pretty print
+ if (f.getName().equals("coffee.xls.json")) {
+ checkForPrettyPrint(f);
+ }
json++;
}
}
assertEquals(17, json);
}
+ private void checkForPrettyPrint(File f) throws IOException { // asserts the listed keys occur in this order
+ String json = FileUtils.readFileToString(f, UTF_8);
+ int previous = json.indexOf("Content-Length");
+ assertTrue(previous > -1, "should have found Content-Length");
+ for (String k : new String[]{"Content-Type", "dc:creator",
+ "dcterms:created", "dcterms:modified", "X-TIKA:content\""}) {
+ int i = json.indexOf(k);
+ assertTrue(i > -1, "should have found " + k);
+ assertTrue(i > previous, "bad order: " + k + " at " + i + " not greater than " + previous); // each key must follow the prior one
+ previous = i;
+ }
+ }
+
/**
* reset outContent and errContent if they are not empty
diff --git a/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java b/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
index 47c517721..a90c5e509 100644
--- a/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
+++ b/tika-pipes/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/pipes/emitter/fs/FileSystemEmitter.java
@@ -56,6 +56,9 @@ import org.apache.tika.pipes.emitter.TikaEmitterException;
* options ('skip', 'replace', 'exception')
* default is 'exception' -->
* <param name="onExists" type="string">skip</param>
+ * <!-- optional; whether or not to pretty print the output
+ * default is false -->
+ * <param name="prettyPrint" type="boolean">true</param>
* </params>
* </emitter>
* </emitters>
@@ -67,6 +70,8 @@ public class FileSystemEmitter extends AbstractEmitter implements StreamEmitter
private String fileExtension = "json";
private ON_EXISTS onExists = ON_EXISTS.EXCEPTION;
+ private boolean prettyPrint = false;
+
@Override
public void emit(String emitKey, List<Metadata> metadataList)
throws IOException, TikaEmitterException {
@@ -88,7 +93,7 @@ public class FileSystemEmitter extends AbstractEmitter implements StreamEmitter
Files.createDirectories(output.getParent());
}
try (Writer writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8)) {
- JsonMetadataList.toJson(metadataList, writer);
+ JsonMetadataList.toJson(metadataList, writer, prettyPrint);
}
}
@@ -133,6 +138,11 @@ public class FileSystemEmitter extends AbstractEmitter implements StreamEmitter
}
}
+ @Field
+ public void setPrettyPrint(boolean prettyPrint) {
+ this.prettyPrint = prettyPrint;
+ }
+
@Override
public void emit(String path, InputStream inputStream, Metadata userMetadata)
throws IOException, TikaEmitterException {
diff --git a/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java b/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java
index 1b8968dbf..e008e0564 100644
--- a/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java
+++ b/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonMetadataList.java
@@ -40,26 +40,38 @@ public class JsonMetadataList {
*
* @param metadataList list of metadata to write
* @param writer writer
+ * @param prettyPrint whether or not to pretty print the output
* @throws org.apache.tika.exception.TikaException if there is an IOException during writing
*/
- public static void toJson(List<Metadata> metadataList, Writer writer) throws IOException {
+ public static void toJson(List<Metadata> metadataList, Writer writer, boolean prettyPrint) throws IOException {
if (metadataList == null) {
writer.write("null");
return;
}
try (JsonGenerator jsonGenerator = new JsonFactory()
.createGenerator(new CloseShieldWriter(writer))) {
- if (PRETTY_PRINT) {
+ if (prettyPrint) {
jsonGenerator.useDefaultPrettyPrinter();
}
jsonGenerator.writeStartArray();
for (Metadata m : metadataList) {
- JsonMetadata.writeMetadataObject(m, jsonGenerator, PRETTY_PRINT);
+ JsonMetadata.writeMetadataObject(m, jsonGenerator, prettyPrint);
}
jsonGenerator.writeEndArray();
}
}
+ /**
+ * Serializes a list of Metadata objects to Json, delegating to the three-argument overload with the class's PRETTY_PRINT value. This does not flush or close the writer.
+ *
+ * @param metadataList list of metadata to write
+ * @param writer writer
+ * @throws IOException if there is an IOException during writing
+ */
+ public static void toJson(List<Metadata> metadataList, Writer writer) throws IOException {
+ toJson(metadataList, writer, PRETTY_PRINT);
+ }
+
/**
* Read metadata from reader. This does not close the reader
*