You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/03/13 01:49:59 UTC
(tika) branch TIKA-4207 updated: TIKA-4207 basically working
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4207
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4207 by this push:
new 3e18b889d TIKA-4207 basically working
3e18b889d is described below
commit 3e18b889ded54e746c4dbd25580d9ac8f73720cf
Author: tallison <ta...@apache.org>
AuthorDate: Tue Mar 12 19:52:26 2024 -0400
TIKA-4207 basically working
---
.../java/org/apache/tika/cli/TikaCLIAsyncTest.java | 16 ++
.../test/java/org/apache/tika/cli/TikaCLITest.java | 2 -
.../AbstractEmbeddedDocumentByteStore.java | 17 ++-
.../extractor/ParsingAndEmbeddedDocExtractor.java | 162 ---------------------
.../ParsingAndEmbeddedDocExtractorFactory.java | 40 -----
.../ParsingEmbeddedDocumentExtractor.java | 41 +++++-
.../apache/tika/parser/RecursiveParserWrapper.java | 2 +
.../java/org/apache/tika/pipes/FetchEmitTuple.java | 2 +-
.../java/org/apache/tika/pipes/PipesServer.java | 144 +++++++++---------
.../extractor/EmbeddedDocumentBytesConfig.java | 73 +++++++++-
.../extractor/EmbeddedDocumentEmitterStore.java | 18 ++-
.../org/apache/tika/parser/mock/MockParser.java | 26 +---
.../org/apache/tika/pipes/PipesServerTest.java | 60 +++++++-
...rocessorTest.java => AsyncChaosMonkeyTest.java} | 2 +-
.../resources/org/apache/tika/pipes/TIKA-4207.xml | 30 ++++
tika-pipes/tika-async-cli/pom.xml | 7 +
.../apache/tika/async/cli/AsyncProcessorTest.java | 138 ++++++++++++++++++
.../apache/tika/async/cli/TikaAsyncCLITest.java | 2 +-
.../test/resources/configs/TIKA-4207-emitter.xml | 35 +++++
.../resources/{ => configs}/tika-config-broken.xml | 0
.../basic_embedded.xml} | 29 ++--
tika-pipes/tika-pipes-iterators/pom.xml | 1 +
.../tika-pipes-iterator-json}/pom.xml | 43 +++---
.../pipesiterator/json/JsonPipesIterator.java | 65 +++++++++
.../pipesiterator/json/TestJsonPipesIterator.java | 85 +++++++++++
.../test-documents/test-with-embedded-bytes.json | 100 +++++++++++++
.../src/test/resources/test-documents/test.json | 100 +++++++++++++
.../metadata/serialization/JsonFetchEmitTuple.java | 42 +++++-
.../serialization/JsonFetchEmitTupleTest.java | 20 +++
29 files changed, 940 insertions(+), 362 deletions(-)
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
index 1f6c8fc2c..d9f6d053f 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.tika.cli;
import static java.nio.charset.StandardCharsets.UTF_8;
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index c160db396..fa16e124a 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -30,9 +30,7 @@ import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
-import org.apache.commons.io.FileUtils;
import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
index c435a3e6e..cbc1f3411 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentByteStore.java
@@ -30,30 +30,31 @@ public abstract class AbstractEmbeddedDocumentByteStore implements EmbeddedDocum
List<Integer> ids = new ArrayList<>();
- public String getFetchKey(String containerFetchKey, int embeddedId,
- EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig,
- Metadata metadata) {
+ public String getEmitKey(String containerEmitKey, int embeddedId,
+ EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig,
+ Metadata metadata) {
String embeddedIdString = embeddedDocumentBytesConfig.getZeroPadName() > 0 ?
StringUtils.leftPad(Integer.toString(embeddedId),
embeddedDocumentBytesConfig.getZeroPadName(), "0") :
Integer.toString(embeddedId);
- StringBuilder fetchKey = new StringBuilder(containerFetchKey)
- .append(embeddedDocumentBytesConfig.getEmbeddedIdPrefix())
+
+ StringBuilder emitKey = new StringBuilder(containerEmitKey)
+ .append("/").append(containerEmitKey).append(embeddedDocumentBytesConfig.getEmbeddedIdPrefix())
.append(embeddedIdString);
if (embeddedDocumentBytesConfig.getSuffixStrategy().equals(
EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.EXISTING)) {
String fName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
String suffix = FilenameUtils.getSuffixFromPath(fName);
- fetchKey.append(suffix);
+ emitKey.append(suffix);
}
- return fetchKey.toString();
+ return emitKey.toString();
}
@Override
public void add(int id, Metadata metadata, byte[] bytes) throws IOException {
- ids.add(id);
+ ids.add(id);
}
@Override
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingAndEmbeddedDocExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingAndEmbeddedDocExtractor.java
deleted file mode 100644
index d88ec94c4..000000000
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingAndEmbeddedDocExtractor.java
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.extractor;
-
-import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
-
-import java.io.File;
-import java.io.FilenameFilter;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-import org.apache.tika.exception.CorruptedFileException;
-import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.DelegatingParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.ParseRecord;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.EmbeddedContentHandler;
-
-/**
- * Helper class for parsers of package archives or other compound document
- * formats that support embedded or attached component documents.
- *
- * This is intended to both parse the embedded documents and extract
- * the raw bytes from the embedded attachments when possible.
- *
- * See also {@link ParsingEmbeddedDocumentExtractor} and {@link ParserContainerExtractor}.
- *
- * @since 3.0.0
- */
-public class ParsingAndEmbeddedDocExtractor implements EmbeddedDocumentExtractor {
-
- private static final File ABSTRACT_PATH = new File("");
-
- private static final Parser DELEGATING_PARSER = new DelegatingParser();
-
- private boolean writeFileNameToContent = true;
-
- private final ParseContext context;
-
- public ParsingAndEmbeddedDocExtractor(ParseContext context) {
- this.context = context;
- }
-
- public boolean shouldParseEmbedded(Metadata metadata) {
- DocumentSelector selector = context.get(DocumentSelector.class);
- if (selector != null) {
- return selector.select(metadata);
- }
-
- FilenameFilter filter = context.get(FilenameFilter.class);
- if (filter != null) {
- String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
- if (name != null) {
- return filter.accept(ABSTRACT_PATH, name);
- }
- }
-
- return true;
- }
-
- public void parseEmbedded(
- InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)
- throws SAXException, IOException {
- if (outputHtml) {
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "package-entry");
- handler.startElement(XHTML, "div", "div", attributes);
- }
-
- String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
- if (writeFileNameToContent && name != null && name.length() > 0 && outputHtml) {
- handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
- char[] chars = name.toCharArray();
- handler.characters(chars, 0, chars.length);
- handler.endElement(XHTML, "h1", "h1");
- }
-
- // Use the delegate parser to parse this entry
- try (TemporaryResources tmp = new TemporaryResources()) {
- final TikaInputStream newStream =
- TikaInputStream.get(CloseShieldInputStream.wrap(stream), tmp, metadata);
- if (stream instanceof TikaInputStream) {
- final Object container = ((TikaInputStream) stream).getOpenContainer();
- if (container != null) {
- newStream.setOpenContainer(container);
- }
- }
- Path p = newStream.getPath();
- storeEmbeddedBytes(p, metadata);
-
- DELEGATING_PARSER.parse(newStream, new EmbeddedContentHandler(new BodyContentHandler(handler)),
- metadata, context);
- } catch (EncryptedDocumentException ede) {
- recordException(ede, context);
- } catch (CorruptedFileException e) {
- //necessary to stop the parse to avoid infinite loops
- //on corrupt sqlite3 files
- throw new IOException(e);
- } catch (TikaException e) {
- recordException(e, context);
- }
-
- if (outputHtml) {
- handler.endElement(XHTML, "div", "div");
- }
- }
-
- private void storeEmbeddedBytes(Path p, Metadata metadata) {
- EmbeddedDocumentByteStore embeddedDocumentByteStore =
- context.get(EmbeddedDocumentByteStore.class);
- int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID);
- try {
- embeddedDocumentByteStore.add(id, metadata, Files.readAllBytes(p));
- } catch (IOException e) {
- //log, or better, store embdocstore exception
- }
- }
-
- private void recordException(Exception e, ParseContext context) {
- ParseRecord record = context.get(ParseRecord.class);
- if (record == null) {
- return;
- }
- record.addException(e);
- }
-
- public Parser getDelegatingParser() {
- return DELEGATING_PARSER;
- }
-
- public void setWriteFileNameToContent(boolean writeFileNameToContent) {
- this.writeFileNameToContent = writeFileNameToContent;
- }
-}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingAndEmbeddedDocExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingAndEmbeddedDocExtractorFactory.java
deleted file mode 100644
index ca4c6633c..000000000
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingAndEmbeddedDocExtractorFactory.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.extractor;
-
-import org.apache.tika.config.Field;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-
-public class ParsingAndEmbeddedDocExtractorFactory
- implements EmbeddedDocumentExtractorFactory {
-
- private boolean writeFileNameToContent = true;
-
- @Field
- public void setWriteFileNameToContent(boolean writeFileNameToContent) {
- this.writeFileNameToContent = writeFileNameToContent;
- }
-
- @Override
- public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) {
- ParsingEmbeddedDocumentExtractor ex =
- new ParsingEmbeddedDocumentExtractor(parseContext);
- ex.setWriteFileNameToContent(writeFileNameToContent);
- return ex;
- }
-}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
index d1b25f17c..46672838b 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
@@ -22,6 +22,8 @@ import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.ContentHandler;
@@ -106,8 +108,12 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
newStream.setOpenContainer(container);
}
}
- DELEGATING_PARSER.parse(newStream, new EmbeddedContentHandler(new BodyContentHandler(handler)),
- metadata, context);
+ EmbeddedDocumentByteStore store = context.get(EmbeddedDocumentByteStore.class);
+ if (store != null) {
+ parseWithBytes(newStream, handler, metadata);
+ } else {
+ parse(newStream, handler, metadata);
+ }
} catch (EncryptedDocumentException ede) {
recordException(ede, context);
} catch (CorruptedFileException e) {
@@ -123,6 +129,37 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
}
}
+ private void parseWithBytes(TikaInputStream stream, ContentHandler handler, Metadata metadata)
+ throws TikaException, IOException, SAXException {
+ Path p = stream.getPath();
+ try {
+ parse(stream, handler, metadata);
+ } finally {
+ storeEmbeddedBytes(p, metadata);
+ }
+ }
+
+ private void parse(TikaInputStream stream, ContentHandler handler, Metadata metadata)
+ throws TikaException, IOException, SAXException {
+ DELEGATING_PARSER.parse(stream,
+ new EmbeddedContentHandler(new BodyContentHandler(handler)),
+ metadata, context);
+ }
+
+ private void storeEmbeddedBytes(Path p, Metadata metadata) {
+ EmbeddedDocumentByteStore embeddedDocumentByteStore =
+ context.get(EmbeddedDocumentByteStore.class);
+ int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID);
+
+ try {
+ embeddedDocumentByteStore.add(id, metadata, Files.readAllBytes(p));
+ } catch (IOException e) {
+ e.printStackTrace();
+ //log, or better, store embdocstore exception
+ }
+ }
+
+
private void recordException(Exception e, ParseContext context) {
ParseRecord record = context.get(ParseRecord.class);
if (record == null) {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index e8f029770..3cb78d520 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -223,6 +223,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
@Override
public void parse(InputStream stream, ContentHandler ignore, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
+
//Test to see if we should avoid parsing
if (parserState.recursiveParserWrapperHandler.hasHitMaximumEmbeddedResources()) {
return;
@@ -255,6 +256,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
//so that you can return it back to its state at the end of this parse
ContentHandler preContextHandler = secureContentHandler.handler;
secureContentHandler.updateContentHandler(localHandler);
+
try {
super.parse(stream, secureContentHandler, metadata, context);
} catch (SAXException e) {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java b/tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java
index c49f3743f..0c0334fd4 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java
@@ -20,8 +20,8 @@ import java.io.Serializable;
import java.util.Objects;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
import org.apache.tika.pipes.emitter.EmitKey;
+import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
import org.apache.tika.pipes.fetcher.FetchKey;
public class FetchEmitTuple implements Serializable {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
index 94ef58502..851805d06 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
@@ -58,12 +58,11 @@ import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.pipes.emitter.StreamEmitter;
-import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
import org.apache.tika.pipes.emitter.EmitData;
import org.apache.tika.pipes.emitter.EmitKey;
import org.apache.tika.pipes.emitter.Emitter;
import org.apache.tika.pipes.emitter.EmitterManager;
+import org.apache.tika.pipes.emitter.StreamEmitter;
import org.apache.tika.pipes.emitter.TikaEmitterException;
import org.apache.tika.pipes.extractor.EmbeddedDocumentEmitterStore;
import org.apache.tika.pipes.fetcher.FetchKey;
@@ -79,7 +78,7 @@ import org.apache.tika.utils.StringUtils;
/**
* This server is forked from the PipesClient. This class isolates
* parsing from the client to protect the primary JVM.
- *
+ * <p>
* When configuring logging for this class, make absolutely certain
* not to write to STDOUT. This class uses STDOUT to communicate with
* the PipesClient.
@@ -96,22 +95,9 @@ public class PipesServer implements Runnable {
private Detector detector;
public enum STATUS {
- READY,
- CALL,
- PING,
- FAILED_TO_START,
- FETCHER_NOT_FOUND,
- EMITTER_NOT_FOUND,
- FETCHER_INITIALIZATION_EXCEPTION,
- FETCH_EXCEPTION,
- PARSE_SUCCESS,
- PARSE_EXCEPTION_NO_EMIT,
- EMIT_SUCCESS,
- EMIT_SUCCESS_PARSE_EXCEPTION,
- EMIT_EXCEPTION,
- OOM,
- TIMEOUT,
- EMPTY_OUTPUT,
+ READY, CALL, PING, FAILED_TO_START, FETCHER_NOT_FOUND, EMITTER_NOT_FOUND,
+ FETCHER_INITIALIZATION_EXCEPTION, FETCH_EXCEPTION, PARSE_SUCCESS, PARSE_EXCEPTION_NO_EMIT,
+ EMIT_SUCCESS, EMIT_SUCCESS_PARSE_EXCEPTION, EMIT_EXCEPTION, OOM, TIMEOUT, EMPTY_OUTPUT,
INTERMEDIATE_RESULT;
byte getByte() {
@@ -126,8 +112,8 @@ public class PipesServer implements Runnable {
STATUS[] statuses = STATUS.values();
if (i >= statuses.length) {
- throw new IllegalArgumentException("byte with index " +
- i + " must be < " + statuses.length);
+ throw new IllegalArgumentException(
+ "byte with index " + i + " must be < " + statuses.length);
}
return statuses[i];
}
@@ -154,8 +140,8 @@ public class PipesServer implements Runnable {
public PipesServer(Path tikaConfigPath, InputStream in, PrintStream out,
- long maxForEmitBatchBytes,
- long serverParseTimeoutMillis, long serverWaitTimeoutMillis)
+ long maxForEmitBatchBytes, long serverParseTimeoutMillis,
+ long serverWaitTimeoutMillis)
throws IOException, TikaException, SAXException {
this.tikaConfigPath = tikaConfigPath;
this.input = new DataInputStream(in);
@@ -197,7 +183,8 @@ public class PipesServer implements Runnable {
synchronized (lock) {
long elapsed = System.currentTimeMillis() - since;
if (parsing && elapsed > serverParseTimeoutMillis) {
- LOG.warn("timeout server; elapsed {} with {}", elapsed, serverParseTimeoutMillis);
+ LOG.warn("timeout server; elapsed {} with {}", elapsed,
+ serverParseTimeoutMillis);
exit(TIMEOUT_EXIT_CODE);
} else if (!parsing && serverWaitTimeoutMillis > 0 &&
elapsed > serverWaitTimeoutMillis) {
@@ -273,6 +260,7 @@ public class PipesServer implements Runnable {
/**
* returns stack trace if there was a container exception or empty string
* if there was no stacktrace
+ *
* @param t
* @param metadataList
* @return
@@ -286,7 +274,8 @@ public class PipesServer implements Runnable {
}
- private void emit(String taskId, EmitKey emitKey, MetadataListAndEmbeddedBytes parseData,
+ private void emit(String taskId, EmitKey emitKey,
+ boolean isExtractEmbeddedBytes, MetadataListAndEmbeddedBytes parseData,
String parseExceptionStack) {
Emitter emitter = null;
@@ -299,7 +288,8 @@ public class PipesServer implements Runnable {
return;
}
try {
- if (parseData.toBePackagedForStreamEmitter()) {
+ if (isExtractEmbeddedBytes &&
+ parseData.toBePackagedForStreamEmitter()) {
emitContentsAndBytes(emitter, emitKey, parseData);
} else {
emitter.emit(emitKey.getEmitKey(), parseData.getMetadataList());
@@ -322,7 +312,7 @@ public class PipesServer implements Runnable {
private void emitContentsAndBytes(Emitter emitter, EmitKey emitKey,
MetadataListAndEmbeddedBytes parseData) {
- if (! (emitter instanceof StreamEmitter)) {
+ if (!(emitter instanceof StreamEmitter)) {
throw new IllegalArgumentException("The emitter for embedded document byte store must" +
" be a StreamEmitter. I see: " + emitter.getClass());
}
@@ -340,7 +330,8 @@ public class PipesServer implements Runnable {
long start = System.currentTimeMillis();
t = readFetchEmitTuple();
if (LOG.isTraceEnabled()) {
- LOG.trace("timer -- read fetchEmitTuple: {} ms", System.currentTimeMillis() - start);
+ LOG.trace("timer -- read fetchEmitTuple: {} ms",
+ System.currentTimeMillis() - start);
}
start = System.currentTimeMillis();
actuallyParse(t);
@@ -375,6 +366,7 @@ public class PipesServer implements Runnable {
MetadataListAndEmbeddedBytes parseData = null;
try {
+ //this can be null if there is a fetch exception
parseData = parseFromTuple(t, fetcher);
if (LOG.isTraceEnabled()) {
@@ -388,10 +380,10 @@ public class PipesServer implements Runnable {
emitParseData(t, parseData);
} finally {
- if (parseData.hasEmbeddedDocumentByteStore() &&
+ if (parseData != null && parseData.hasEmbeddedDocumentByteStore() &&
parseData.getEmbeddedDocumentByteStore() instanceof Closeable) {
try {
- ((Closeable)parseData.getEmbeddedDocumentByteStore()).close();
+ ((Closeable) parseData.getEmbeddedDocumentByteStore()).close();
} catch (IOException e) {
LOG.warn("problem closing embedded document byte store", e);
}
@@ -404,7 +396,8 @@ public class PipesServer implements Runnable {
String stack = getContainerStacktrace(t, parseData.getMetadataList());
//we need to apply this after we pull out the stacktrace
filterMetadata(parseData.getMetadataList());
- if (StringUtils.isBlank(stack) || t.getOnParseException() == FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT) {
+ if (StringUtils.isBlank(stack) ||
+ t.getOnParseException() == FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT) {
injectUserMetadata(t.getMetadata(), parseData.getMetadataList());
EmitKey emitKey = t.getEmitKey();
if (StringUtils.isBlank(emitKey.getEmitKey())) {
@@ -412,21 +405,20 @@ public class PipesServer implements Runnable {
t.setEmitKey(emitKey);
}
EmitData emitData = new EmitData(t.getEmitKey(), parseData.getMetadataList(), stack);
- if (parseData.toBePackagedForStreamEmitter()) {
- emit(t.getId(), emitKey, parseData, stack);
- if (LOG.isTraceEnabled()) {
- LOG.trace("timer -- emitted: {} ms", System.currentTimeMillis() - start);
- }
- } else if (maxForEmitBatchBytes >= 0 && emitData.getEstimatedSizeBytes() >= maxForEmitBatchBytes) {
- emit(t.getId(), emitKey, parseData, stack);
- if (LOG.isTraceEnabled()) {
- LOG.trace("timer -- emitted: {} ms", System.currentTimeMillis() - start);
- }
+ if (t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes() &&
+ parseData.toBePackagedForStreamEmitter()) {
+ emit(t.getId(), emitKey, t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes(),
+ parseData, stack);
+ } else if (maxForEmitBatchBytes >= 0 &&
+ emitData.getEstimatedSizeBytes() >= maxForEmitBatchBytes) {
+ emit(t.getId(), emitKey, t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes(),
+ parseData, stack);
} else {
+ //send back to the client
write(emitData);
- if (LOG.isTraceEnabled()) {
- LOG.trace("timer -- to write data: {} ms", System.currentTimeMillis() - start);
- }
+ }
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("timer -- emitted: {} ms", System.currentTimeMillis() - start);
}
} else {
write(STATUS.PARSE_EXCEPTION_NO_EMIT, stack);
@@ -452,10 +444,8 @@ public class PipesServer implements Runnable {
write(STATUS.FETCHER_NOT_FOUND, noFetcherMsg);
return null;
} catch (IOException | TikaException e) {
- LOG.warn("Couldn't initialize fetcher for fetch id '" +
- t.getId() + "'", e);
- write(STATUS.FETCHER_INITIALIZATION_EXCEPTION,
- ExceptionUtils.getStackTrace(e));
+ LOG.warn("Couldn't initialize fetcher for fetch id '" + t.getId() + "'", e);
+ write(STATUS.FETCHER_INITIALIZATION_EXCEPTION, ExceptionUtils.getStackTrace(e));
return null;
}
}
@@ -463,12 +453,12 @@ public class PipesServer implements Runnable {
protected MetadataListAndEmbeddedBytes parseFromTuple(FetchEmitTuple t, Fetcher fetcher) {
FetchKey fetchKey = t.getFetchKey();
if (fetchKey.hasRange()) {
- if (! (fetcher instanceof RangeFetcher)) {
+ if (!(fetcher instanceof RangeFetcher)) {
throw new IllegalArgumentException(
"fetch key has a range, but the fetcher is not a range fetcher");
}
Metadata metadata = new Metadata();
- try (InputStream stream = ((RangeFetcher)fetcher).fetch(fetchKey.getFetchKey(),
+ try (InputStream stream = ((RangeFetcher) fetcher).fetch(fetchKey.getFetchKey(),
fetchKey.getRangeStart(), fetchKey.getRangeEnd(), metadata)) {
return parseWithStream(t, stream, metadata);
} catch (SecurityException e) {
@@ -530,39 +520,39 @@ public class PipesServer implements Runnable {
exit(1);
}
- private MetadataListAndEmbeddedBytes parseWithStream(FetchEmitTuple fetchEmitTuple, InputStream stream,
- Metadata metadata) throws TikaConfigException {
+ private MetadataListAndEmbeddedBytes parseWithStream(FetchEmitTuple fetchEmitTuple,
+ InputStream stream, Metadata metadata)
+ throws TikaConfigException {
HandlerConfig handlerConfig = fetchEmitTuple.getHandlerConfig();
List<Metadata> metadataList;
//this adds the EmbeddedDocumentByteStore to the parsecontext
ParseContext parseContext = createParseContext(fetchEmitTuple);
-
if (handlerConfig.getParseMode() == HandlerConfig.PARSE_MODE.RMETA) {
- metadataList = parseRecursive(fetchEmitTuple, handlerConfig, stream, metadata, parseContext);
+ metadataList =
+ parseRecursive(fetchEmitTuple, handlerConfig, stream, metadata, parseContext);
} else {
- metadataList = parseConcatenated(fetchEmitTuple, handlerConfig, stream, metadata, parseContext);
+ metadataList = parseConcatenated(fetchEmitTuple, handlerConfig, stream, metadata,
+ parseContext);
}
return new MetadataListAndEmbeddedBytes(metadataList,
- parseContext.get(EmbeddedDocumentByteStore.class));
+ parseContext.get(EmbeddedDocumentByteStore.class));
}
private ParseContext createParseContext(FetchEmitTuple fetchEmitTuple)
throws TikaConfigException {
ParseContext parseContext = new ParseContext();
- if (fetchEmitTuple.getEmbeddedDocumentBytesConfig() == EmbeddedDocumentBytesConfig.SKIP) {
+ if (! fetchEmitTuple.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes()) {
return parseContext;
}
- if (! StringUtils.isBlank(fetchEmitTuple.getEmbeddedDocumentBytesConfig().getEmitter())) {
+ if (!StringUtils.isBlank(fetchEmitTuple.getEmbeddedDocumentBytesConfig().getEmitter())) {
parseContext.set(EmbeddedDocumentByteStore.class,
new EmbeddedDocumentEmitterStore(fetchEmitTuple.getEmitKey(),
- fetchEmitTuple.getEmbeddedDocumentBytesConfig(),
- emitterManager));
+ fetchEmitTuple.getEmbeddedDocumentBytesConfig(), emitterManager));
} else {
- parseContext.set(EmbeddedDocumentByteStore.class,
- new BasicEmbeddedDocumentByteStore(fetchEmitTuple.getEmbeddedDocumentBytesConfig()));
-
+ parseContext.set(EmbeddedDocumentByteStore.class, new BasicEmbeddedDocumentByteStore(
+ fetchEmitTuple.getEmbeddedDocumentBytesConfig()));
}
return parseContext;
}
@@ -580,6 +570,7 @@ public class PipesServer implements Runnable {
parseContext.set(DocumentSelector.class, new DocumentSelector() {
final int maxEmbedded = handlerConfig.maxEmbeddedResources;
int embedded = 0;
+
@Override
public boolean select(Metadata metadata) {
if (maxEmbedded < 0) {
@@ -625,10 +616,11 @@ public class PipesServer implements Runnable {
//We need to let stacktraces percolate
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
new BasicContentHandlerFactory(handlerConfig.getType(),
- handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), parseContext),
- handlerConfig.getMaxEmbeddedResources());
+ handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(),
+ parseContext), handlerConfig.getMaxEmbeddedResources());
long start = System.currentTimeMillis();
+
preParse(fetchEmitTuple, stream, metadata, parseContext);
try {
rMetaParser.parse(stream, handler, metadata, parseContext);
@@ -683,9 +675,9 @@ public class PipesServer implements Runnable {
}
if (t.getEmbeddedDocumentBytesConfig() != null &&
- t.getEmbeddedDocumentBytesConfig().isIncludeOriginal()) {
+ t.getEmbeddedDocumentBytesConfig().isIncludeOriginal()) {
EmbeddedDocumentByteStore embeddedDocumentByteStore =
- parseContext.get(EmbeddedDocumentEmitterStore.class);
+ parseContext.get(EmbeddedDocumentByteStore.class);
try {
embeddedDocumentByteStore.add(0, metadata, Files.readAllBytes(tis.getPath()));
} catch (IOException e) {
@@ -747,14 +739,15 @@ public class PipesServer implements Runnable {
this.emitterManager = null;
}
this.autoDetectParser = new AutoDetectParser(this.tikaConfig);
- if (((AutoDetectParser)autoDetectParser).getAutoDetectParserConfig().getDigesterFactory() != null) {
- this.digester = ((AutoDetectParser) autoDetectParser).
- getAutoDetectParserConfig().getDigesterFactory().build();
+ if (((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig()
+ .getDigesterFactory() != null) {
+ this.digester = ((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig()
+ .getDigesterFactory().build();
//override this value because we'll be digesting before parse
- ((AutoDetectParser)autoDetectParser).getAutoDetectParserConfig().getDigesterFactory()
+ ((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig().getDigesterFactory()
.setSkipContainerDocument(true);
}
- this.detector = ((AutoDetectParser)this.autoDetectParser).getDetector();
+ this.detector = ((AutoDetectParser) this.autoDetectParser).getDetector();
this.rMetaParser = new RecursiveParserWrapper(autoDetectParser);
}
@@ -813,7 +806,7 @@ public class PipesServer implements Runnable {
}
}
- private class MetadataListAndEmbeddedBytes {
+ class MetadataListAndEmbeddedBytes {
final List<Metadata> metadataList;
final Optional<EmbeddedDocumentByteStore> embeddedDocumentByteStore;
@@ -834,6 +827,7 @@ public class PipesServer implements Runnable {
/**
* Tests whether there is any type of embedded document byte store,
* for example, one that may require closing at the end of the parse.
+ *
* @return
*/
public boolean hasEmbeddedDocumentByteStore() {
@@ -843,13 +837,13 @@ public class PipesServer implements Runnable {
/**
* Whether the intent is that the metadata and byte store be packaged in a zip
* or similar and emitted via a single stream emitter.
- *
+ * <p>
* This is basically a test that this is not an EmbeddedDocumentEmitterStore.
*
* @return
*/
public boolean toBePackagedForStreamEmitter() {
- return ! (embeddedDocumentByteStore.get() instanceof EmbeddedDocumentEmitterStore);
+ return !(embeddedDocumentByteStore.get() instanceof EmbeddedDocumentEmitterStore);
}
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
index cdf3c77fe..42538ff80 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
@@ -16,12 +16,32 @@
*/
package org.apache.tika.pipes.extractor;
-public class EmbeddedDocumentBytesConfig {
+import java.io.Serializable;
+import java.util.Objects;
+
+public class EmbeddedDocumentBytesConfig implements Serializable {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -3861669115439125268L;
+
public static EmbeddedDocumentBytesConfig SKIP = new EmbeddedDocumentBytesConfig(false);
public enum SUFFIX_STRATEGY {
- NONE, EXISTING, DETECTED
+ NONE, EXISTING, DETECTED;
+
+ public static SUFFIX_STRATEGY parse(String s) {
+ if (s.equalsIgnoreCase("none")) {
+ return NONE;
+ } else if (s.equalsIgnoreCase("existing")) {
+ return EXISTING;
+ } else if (s.equalsIgnoreCase("detected")) {
+ return DETECTED;
+ }
+ throw new IllegalArgumentException("can't parse " + s);
+ }
}
private final boolean extractEmbeddedDocumentBytes;
//TODO -- add these at some point
@@ -90,4 +110,53 @@ public class EmbeddedDocumentBytesConfig {
public void setIncludeOriginal(boolean includeOriginal) {
this.includeOriginal = includeOriginal;
}
+
+ @Override
+ public String toString() {
+ return "EmbeddedDocumentBytesConfig{" + "extractEmbeddedDocumentBytes=" +
+ extractEmbeddedDocumentBytes + ", zeroPadName=" + zeroPadName +
+ ", suffixStrategy=" + suffixStrategy + ", embeddedIdPrefix='" + embeddedIdPrefix +
+ '\'' + ", emitter='" + emitter + '\'' + ", includeOriginal=" + includeOriginal +
+ '}';
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ EmbeddedDocumentBytesConfig that = (EmbeddedDocumentBytesConfig) o;
+
+ if (extractEmbeddedDocumentBytes != that.extractEmbeddedDocumentBytes) {
+ return false;
+ }
+ if (zeroPadName != that.zeroPadName) {
+ return false;
+ }
+ if (includeOriginal != that.includeOriginal) {
+ return false;
+ }
+ if (suffixStrategy != that.suffixStrategy) {
+ return false;
+ }
+ if (!Objects.equals(embeddedIdPrefix, that.embeddedIdPrefix)) {
+ return false;
+ }
+ return Objects.equals(emitter, that.emitter);
+ }
+
+ @Override
+ public int hashCode() {
+ int result = (extractEmbeddedDocumentBytes ? 1 : 0);
+ result = 31 * result + zeroPadName;
+ result = 31 * result + (suffixStrategy != null ? suffixStrategy.hashCode() : 0);
+ result = 31 * result + (embeddedIdPrefix != null ? embeddedIdPrefix.hashCode() : 0);
+ result = 31 * result + (emitter != null ? emitter.hashCode() : 0);
+ result = 31 * result + (includeOriginal ? 1 : 0);
+ return result;
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
index ddcca6edf..915b44d44 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentEmitterStore.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.tika.pipes.extractor;
import java.io.Closeable;
@@ -39,7 +55,7 @@ public class EmbeddedDocumentEmitterStore extends AbstractEmbeddedDocumentByteSt
@Override
public void add(int id, Metadata metadata, byte[] bytes) throws IOException {
//intentionally do not call super.add, because we want the ids list to be empty
- String emitKey = getFetchKey(containerEmitKey.getEmitKey(),
+ String emitKey = getEmitKey(containerEmitKey.getEmitKey(),
id, embeddedDocumentBytesConfig, metadata);
try {
diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
index 0051a7740..de464bca5 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
@@ -45,6 +45,7 @@ import com.martensigwart.fakeload.FakeLoadBuilder;
import com.martensigwart.fakeload.FakeLoadExecutor;
import com.martensigwart.fakeload.FakeLoadExecutors;
import com.martensigwart.fakeload.MemoryUnit;
+import org.apache.commons.io.input.CloseShieldInputStream;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
@@ -54,7 +55,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -120,7 +121,7 @@ public class MockParser implements Parser {
Document doc = null;
try {
DocumentBuilder docBuilder = context.getDocumentBuilder();
- doc = docBuilder.parse(stream);
+ doc = docBuilder.parse(new CloseShieldInputStream(stream));
} catch (SAXException e) {
//to distinguish between SAX on read vs SAX while writing
throw new IOException(e);
@@ -258,29 +259,16 @@ public class MockParser implements Parser {
}
String embeddedText = action.getTextContent();
- EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(context);
+ EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+
Metadata m = new Metadata();
m.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
if (!"".equals(contentType)) {
m.set(Metadata.CONTENT_TYPE, contentType);
}
- InputStream is = new ByteArrayInputStream(embeddedText.getBytes(UTF_8));
-
- extractor.parseEmbedded(is, new EmbeddedContentHandler(handler), m, true);
-
-
- }
-
- protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
- EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class);
- if (extractor == null) {
- Parser p = context.get(Parser.class);
- if (p == null) {
- context.set(Parser.class, new MockParser());
- }
- extractor = new ParsingEmbeddedDocumentExtractor(context);
+ try (InputStream is = new ByteArrayInputStream(embeddedText.getBytes(UTF_8))) {
+ extractor.parseEmbedded(is, new EmbeddedContentHandler(handler), m, true);
}
- return extractor;
}
private void print(Node action, String name) throws IOException {
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
index 92d8c5c11..857bf485f 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
@@ -22,8 +22,9 @@ import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.util.List;
+import java.nio.file.Paths;
+import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
@@ -33,6 +34,7 @@ import org.junit.jupiter.api.io.TempDir;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.emitter.EmitKey;
+import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
import org.apache.tika.pipes.fetcher.FetchKey;
import org.apache.tika.pipes.fetcher.Fetcher;
import org.apache.tika.pipes.fetcher.FetcherManager;
@@ -69,8 +71,60 @@ public class PipesServerTest extends TikaTest {
new FetchKey("fs", "mock.xml"),
new EmitKey("", ""));
Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher();
- List<Metadata> metadataList = pipesServer.parseFromTuple(fetchEmitTuple, fetcher);
+ PipesServer.MetadataListAndEmbeddedBytes
+ parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher);
assertEquals("5f3b924303e960ce35d7f705e91d3018dd110a9c3cef0546a91fe013d6dad6fd",
- metadataList.get(0).get("X-TIKA:digest:SHA-256"));
+ parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256"));
+ }
+
+ @Test
+ public void testEmbeddedStreamEmitter() throws Exception {
+ Path tmp = Paths.get("/home/tallison/Desktop/tmp");
+ if (Files.isDirectory(tmp)) {
+ FileUtils.deleteDirectory(tmp.toFile());
+ }
+ Files.createDirectories(tmp);
+ Path tikaConfig = tmp.resolve("tika-config.xml");
+
+ String xml = IOUtils.toString(
+ PipesServerTest.class.getResourceAsStream("TIKA-4207.xml"),
+ StandardCharsets.UTF_8);
+ xml = xml.replace("BASE_PATH", tmp.toAbsolutePath().toString());
+ Files.write(tikaConfig, xml.getBytes(StandardCharsets.UTF_8));
+
+ Files.copy(PipesServerTest.class.getResourceAsStream("/test-documents/basic_embedded.xml"),
+ tmp.resolve("mock.xml"));
+
+ PipesServer pipesServer = new PipesServer(tikaConfig,
+ new UnsynchronizedByteArrayInputStream(new byte[0]),
+ new PrintStream(new UnsynchronizedByteArrayOutputStream(), true,
+ StandardCharsets.UTF_8.name()),
+ -1, 30000, 30000);
+
+ pipesServer.initializeResources();
+ EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig =
+ new EmbeddedDocumentBytesConfig(true);
+ embeddedDocumentBytesConfig.setIncludeOriginal(true);
+
+ FetchEmitTuple fetchEmitTuple = new FetchEmitTuple("id",
+ new FetchKey("fs", "mock.xml"),
+ new EmitKey("", ""), new Metadata(),
+ HandlerConfig.DEFAULT_HANDLER_CONFIG, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT,
+ embeddedDocumentBytesConfig);
+ Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher();
+ PipesServer.MetadataListAndEmbeddedBytes
+ parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher);
+ assertEquals(2, parseData.metadataList.size());
+
+ byte[] bytes0 = parseData.getEmbeddedDocumentByteStore().getDocument(0);
+ byte[] bytes1 = parseData.getEmbeddedDocumentByteStore().getDocument(1);
+
+ assertContains("is to trigger mock on the embedded",
+ new String(bytes0, StandardCharsets.UTF_8));
+
+ assertContains("embeddedAuthor</metadata>",
+ new String(bytes1, StandardCharsets.UTF_8));
+ assertEquals("fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a",
+ parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256"));
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/AsyncProcessorTest.java b/tika-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java
similarity index 99%
rename from tika-core/src/test/java/org/apache/tika/pipes/async/AsyncProcessorTest.java
rename to tika-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java
index 0277bc11d..4522a2ea1 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/async/AsyncProcessorTest.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java
@@ -40,7 +40,7 @@ import org.apache.tika.pipes.fetcher.FetchKey;
import org.apache.tika.pipes.pipesiterator.PipesIterator;
import org.apache.tika.utils.ProcessUtils;
-public class AsyncProcessorTest {
+public class AsyncChaosMonkeyTest {
private final String OOM = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + "<mock>" +
"<throw class=\"java.lang.OutOfMemoryError\">oom message</throw>\n</mock>";
diff --git a/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207.xml b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207.xml
new file mode 100644
index 000000000..9f37ad0fe
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <autoDetectParserConfig>
+ <digesterFactory class="org.apache.tika.pipes.async.MockDigesterFactory">
+ <skipContainerDocument>false</skipContainerDocument>
+ </digesterFactory>
+ </autoDetectParserConfig>
+ <fetchers>
+ <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher">
+ <name>fs</name>
+ <basePath>BASE_PATH</basePath>
+ </fetcher>
+ </fetchers>
+</properties>
\ No newline at end of file
diff --git a/tika-pipes/tika-async-cli/pom.xml b/tika-pipes/tika-async-cli/pom.xml
index db2966136..239cf22c7 100644
--- a/tika-pipes/tika-async-cli/pom.xml
+++ b/tika-pipes/tika-async-cli/pom.xml
@@ -37,6 +37,13 @@
<artifactId>tika-core</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
<!-- logging -->
<dependency>
<groupId>org.apache.logging.log4j</groupId>
diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
new file mode 100644
index 000000000..d5a6527f6
--- /dev/null
+++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.async.cli;
+
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.pipes.FetchEmitTuple;
+import org.apache.tika.pipes.HandlerConfig;
+import org.apache.tika.pipes.async.AsyncProcessor;
+import org.apache.tika.pipes.emitter.EmitKey;
+import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
+import org.apache.tika.pipes.fetcher.FetchKey;
+import org.apache.tika.pipes.pipesiterator.PipesIterator;
+
+/**
+ * This should be in tika-core, but we want to avoid a dependency mess with tika-serialization
+ */
+public class AsyncProcessorTest extends TikaTest {
+ //TODO -- integrate json pipes iterator and run with AsyncProcessor.main
+ @TempDir
+ private Path basedir;
+ private Path inputDir;
+
+ private Path bytesDir;
+
+ private Path jsonDir;
+
+ private Path configDir;
+
+ @BeforeEach
+ public void setUp() throws IOException {
+ inputDir = basedir.resolve("input");
+
+ bytesDir = basedir.resolve("bytes");
+
+ jsonDir = basedir.resolve("json");
+
+ configDir = basedir.resolve("config");
+ Path tikaConfig = configDir.resolve("tika-config.xml");
+
+ Files.createDirectories(basedir);
+ Files.createDirectories(configDir);
+ Files.createDirectories(inputDir);
+
+ String xml = IOUtils.toString(
+ AsyncProcessorTest.class.getResourceAsStream("/configs/TIKA-4207-emitter.xml"),
+ StandardCharsets.UTF_8);
+ //replace the placeholder paths in the config template with the test directories
+ xml = xml.replace("BASE_PATH", inputDir.toAbsolutePath().toString());
+ xml = xml.replace("JSON_PATH", jsonDir.toAbsolutePath().toString());
+ xml = xml.replace("BYTES_PATH", bytesDir.toAbsolutePath().toString());
+
+ Files.writeString(tikaConfig, xml, StandardCharsets.UTF_8);
+
+ Path mock = inputDir.resolve("mock.xml");
+ try (OutputStream os = Files.newOutputStream(mock)) {
+ IOUtils.copy(getClass().getResourceAsStream("/test-documents/basic_embedded.xml"),
+ os);
+ }
+ }
+
+ @Test
+ public void testBasic() throws Exception {
+// TikaAsyncCLI cli = new TikaAsyncCLI();
+ // cli.main(new String[]{ configDir.resolve("tika-config.xml").toAbsolutePath().toString()});
+ AsyncProcessor processor = new AsyncProcessor(configDir.resolve("tika-config.xml"));
+
+ EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig =
+ new EmbeddedDocumentBytesConfig(true);
+ embeddedDocumentBytesConfig.setIncludeOriginal(true);
+ embeddedDocumentBytesConfig.setEmitter("bytes");
+
+ FetchEmitTuple t = new FetchEmitTuple("myId-1",
+ new FetchKey("fs", "mock.xml"),
+ new EmitKey("json", "emit-1"),
+ new Metadata(), HandlerConfig.DEFAULT_HANDLER_CONFIG,
+ FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT, embeddedDocumentBytesConfig);
+
+ processor.offer(t, 1000);
+
+ for (int i = 0; i < 10; i++) {
+ processor.offer(PipesIterator.COMPLETED_SEMAPHORE, 1000);
+ }
+ //TODO clean this up
+ while (processor.checkActive()) {
+ Thread.sleep(100);
+ }
+ processor.close();
+
+ String container = Files.readString(bytesDir.resolve("emit-1-0"));
+ assertContains("\"dc:creator\">Nikolai Lobachevsky", container);
+
+ String xmlEmbedded = Files.readString(bytesDir.resolve("emit-1-1"));
+ assertContains("name=\"dc:creator\"", xmlEmbedded);
+ assertContains(">embeddedAuthor</metadata>", xmlEmbedded);
+
+ List<Metadata> metadataList;
+ try (BufferedReader reader = Files.newBufferedReader(jsonDir.resolve("emit-1.json"))) {
+ metadataList = JsonMetadataList.fromJson(reader);
+ }
+ assertEquals(2, metadataList.size());
+ assertContains("main_content", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+ assertContains("some_embedded_content",
+ metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+ }
+}
diff --git a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaAsyncCLITest.java b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaAsyncCLITest.java
index fc6694c74..08c962f10 100644
--- a/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaAsyncCLITest.java
+++ b/tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaAsyncCLITest.java
@@ -28,7 +28,7 @@ import org.apache.tika.exception.TikaConfigException;
public class TikaAsyncCLITest {
@Test
public void testCrash() throws Exception {
- Path config = getPath("/tika-config-broken.xml");
+ Path config = getPath("/configs/tika-config-broken.xml");
assertThrows(TikaConfigException.class,
() -> TikaAsyncCLI.main(
new String[] {
diff --git a/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4207-emitter.xml b/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4207-emitter.xml
new file mode 100644
index 000000000..5391c8496
--- /dev/null
+++ b/tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4207-emitter.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <fetchers>
+ <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher">
+ <name>fs</name>
+ <basePath>BASE_PATH</basePath>
+ </fetcher>
+ </fetchers>
+ <emitters>
+ <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter">
+ <name>json</name>
+ <basePath>JSON_PATH</basePath>
+ </emitter>
+ <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter">
+ <name>bytes</name>
+ <basePath>BYTES_PATH</basePath>
+ </emitter>
+ </emitters>
+</properties>
\ No newline at end of file
diff --git a/tika-pipes/tika-async-cli/src/test/resources/tika-config-broken.xml b/tika-pipes/tika-async-cli/src/test/resources/configs/tika-config-broken.xml
similarity index 100%
copy from tika-pipes/tika-async-cli/src/test/resources/tika-config-broken.xml
copy to tika-pipes/tika-async-cli/src/test/resources/configs/tika-config-broken.xml
diff --git a/tika-pipes/tika-async-cli/src/test/resources/tika-config-broken.xml b/tika-pipes/tika-async-cli/src/test/resources/test-documents/basic_embedded.xml
similarity index 59%
rename from tika-pipes/tika-async-cli/src/test/resources/tika-config-broken.xml
rename to tika-pipes/tika-async-cli/src/test/resources/test-documents/basic_embedded.xml
index 5ee379e6f..7536a1603 100644
--- a/tika-pipes/tika-async-cli/src/test/resources/tika-config-broken.xml
+++ b/tika-pipes/tika-async-cli/src/test/resources/test-documents/basic_embedded.xml
@@ -1,4 +1,5 @@
<?xml version="1.0" encoding="UTF-8" ?>
+
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@@ -17,16 +18,18 @@
specific language governing permissions and limitations
under the License.
-->
-<properties>
- <fetchers>
- <fetcher class="org.apache.tika.pipes.fetcher.s3.S3Fetcher">
- <name>s3</name>
- <region>us-east-1</region>
- <profile><!-- fill in here --></profile>
- </fetcher>
- </fetchers>
- <pipesIterator class="org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator">
- <fetcherName>fs</fetcherName>
- <basePath>basePath</basePath>
- </pipesIterator>
-</properties>
\ No newline at end of file
+
+<mock>
+
+ <metadata action="add" name="dc:creator">Nikolai Lobachevsky</metadata>
+ <write element="p">main_content</write>
+ <!-- auto detection wasn't working for some reason; add content-type as
+ is to trigger mock on the embedded -->
+ <embedded filename="embed1.xml" content-type="application/mock+xml">
+ <mock>
+ <metadata action="add" name="dc:creator">embeddedAuthor</metadata>
+ <write element="p">some_embedded_content</write>
+ </mock>
+ </embedded>
+
+</mock>
\ No newline at end of file
diff --git a/tika-pipes/tika-pipes-iterators/pom.xml b/tika-pipes/tika-pipes-iterators/pom.xml
index 1abdb0782..5cb99fbd1 100644
--- a/tika-pipes/tika-pipes-iterators/pom.xml
+++ b/tika-pipes/tika-pipes-iterators/pom.xml
@@ -35,6 +35,7 @@
in tika-core if you want a file system directory crawler -->
<modules>
<module>tika-pipes-iterator-csv</module>
+ <module>tika-pipes-iterator-json</module>
<module>tika-pipes-iterator-jdbc</module>
<module>tika-pipes-iterator-s3</module>
<module>tika-pipes-iterator-kafka</module>
diff --git a/tika-pipes/tika-async-cli/pom.xml b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/pom.xml
similarity index 78%
copy from tika-pipes/tika-async-cli/pom.xml
copy to tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/pom.xml
index db2966136..7b3307f5e 100644
--- a/tika-pipes/tika-async-cli/pom.xml
+++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/pom.xml
@@ -20,15 +20,15 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>org.apache.tika</groupId>
- <artifactId>tika-pipes</artifactId>
+ <artifactId>tika-pipes-iterators</artifactId>
<version>3.0.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<modelVersion>4.0.0</modelVersion>
- <artifactId>tika-async-cli</artifactId>
+ <artifactId>tika-pipes-iterator-json</artifactId>
- <name>Apache Tika Async CLI</name>
+ <name>Apache Tika Pipes Iterator - json</name>
<url>https://tika.apache.org/</url>
<dependencies>
@@ -36,38 +36,34 @@
<groupId>${project.groupId}</groupId>
<artifactId>tika-core</artifactId>
<version>${project.version}</version>
- </dependency>
- <!-- logging -->
- <dependency>
- <groupId>org.apache.logging.log4j</groupId>
- <artifactId>log4j-core</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.logging.log4j</groupId>
- <artifactId>log4j-slf4j2-impl</artifactId>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-emitter-fs</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
+ <scope>provided</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-serialization</artifactId>
<version>${project.version}</version>
- <scope>test</scope>
+ <scope>provided</scope>
</dependency>
</dependencies>
<build>
<plugins>
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ <version>${rat.version}</version>
+ <configuration>
+ <excludes>
+ <exclude>src/test/resources/test-simple.csv</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<archive>
<manifestEntries>
- <Automatic-Module-Name>org.apache.tika.pipes.reporters.fs.status</Automatic-Module-Name>
+ <Automatic-Module-Name>org.apache.tika.pipes.pipesiterator.csv</Automatic-Module-Name>
</manifestEntries>
</archive>
</configuration>
@@ -104,12 +100,6 @@
</filter>
</filters>
<transformers>
- <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
- <mainClass>org.apache.tika.async.cli.TikaAsyncCLI</mainClass>
- <manifestEntries>
- <Multi-Release>true</Multi-Release>
- </manifestEntries>
- </transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
<resource>META-INF/LICENSE</resource>
<file>target/classes/META-INF/LICENSE</file>
@@ -127,6 +117,7 @@
</execution>
</executions>
</plugin>
+
</plugins>
</build>
diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIterator.java
new file mode 100644
index 000000000..4ff338736
--- /dev/null
+++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIterator.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.pipesiterator.json;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.concurrent.TimeoutException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.Initializable;
+import org.apache.tika.metadata.serialization.JsonFetchEmitTuple;
+import org.apache.tika.pipes.FetchEmitTuple;
+import org.apache.tika.pipes.pipesiterator.PipesIterator;
+
+/**
+ * Iterates through a UTF-8 text file with one FetchEmitTuple
+ * JSON object per line.
+ */
+public class JsonPipesIterator extends PipesIterator implements Initializable {
+
+ private static final Logger LOGGER = LoggerFactory.getLogger(JsonPipesIterator.class);
+
+ private Path jsonPath;
+
+ @Override
+ protected void enqueue() throws InterruptedException, IOException, TimeoutException {
+ try (BufferedReader reader = Files.newBufferedReader(jsonPath, StandardCharsets.UTF_8)) {
+ String line = reader.readLine();
+ while (line != null) {
+ try (Reader r = new StringReader(line)) {
+ FetchEmitTuple t = JsonFetchEmitTuple.fromJson(r);
+ LOGGER.info("from json: " + t);
+ tryToAdd(t);
+ line = reader.readLine();
+ }
+ }
+ }
+ }
+
+ public void setJsonPath(String jsonPath) {
+ this.jsonPath = Paths.get(jsonPath);
+ }
+}
diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java
new file mode 100644
index 000000000..671fecc5f
--- /dev/null
+++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.pipes.pipesiterator.json;
+
+import java.nio.file.Paths;
+import java.util.Iterator;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.pipes.FetchEmitTuple;
+
+/**
+ * Smoke tests that run {@link JsonPipesIterator} over the json-lines
+ * fixtures under {@code /test-documents}. The tests only drain the
+ * iterator; they do not yet assert on the tuples that come back.
+ */
+@Disabled("until we can write actual tests")
+public class TestJsonPipesIterator {
+
+    /** Drains the iterator built from {@code test.json}. */
+    @Test
+    public void testBasic() throws Exception {
+        JsonPipesIterator pipesIterator = new JsonPipesIterator();
+        pipesIterator.setJsonPath(
+                Paths.get(this.getClass().getResource("/test-documents/test.json").toURI())
+                        .toAbsolutePath().toString());
+        Iterator<FetchEmitTuple> it = pipesIterator.iterator();
+        while (it.hasNext()) {
+            // next() must be called: hasNext() alone does not advance the
+            // iterator, so the loop would otherwise never finish
+            it.next();
+        }
+    }
+
+    /** Drains the iterator built from {@code test-with-embedded-bytes.json}. */
+    @Test
+    public void testWithEmbDocBytes() throws Exception {
+        JsonPipesIterator pipesIterator = new JsonPipesIterator();
+        pipesIterator.setJsonPath(
+                Paths.get(
+                        this.getClass().getResource("/test-documents/test-with-embedded-bytes.json").toURI())
+                        .toAbsolutePath().toString());
+        Iterator<FetchEmitTuple> it = pipesIterator.iterator();
+        while (it.hasNext()) {
+            // next() must be called: hasNext() alone does not advance the
+            // iterator, so the loop would otherwise never finish
+            it.next();
+        }
+    }
+
+
+    /*
+    //use this to generate test files
+    public static void main(String[] args) throws Exception {
+        Path p = Paths.get("/home/tallison/Intellij/tika-main/tika-pipes/tika-pipes-iterators" +
+                "/tika-pipes-iterator-json/src/test/resources/test-documents/test-with-embedded" +
+                "-bytes.json");
+        try (BufferedWriter writer = Files.newBufferedWriter(p, StandardCharsets.UTF_8)) {
+            HandlerConfig handlerConfig =
+                    new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                            HandlerConfig.PARSE_MODE.RMETA, -1, -1,
+                            false);
+            EmbeddedDocumentBytesConfig config = new EmbeddedDocumentBytesConfig(true);
+            for (int i = 0; i < 100; i++) {
+                String id = "myid-"+i;
+                FetchEmitTuple t = new FetchEmitTuple(
+                        id,
+                        new FetchKey("fs", i + ".xml"),
+                        new EmitKey("fs", i + ".xml.json"),
+                        new Metadata(),
+                        handlerConfig,
+                        FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT,
+                        config);
+                String line = JsonFetchEmitTuple.toJson(t);
+                writer.write(line);
+                writer.newLine();
+            }
+        }
+    }*/
+}
diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/resources/test-documents/test-with-embedded-bytes.json b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/resources/test-documents/test-with-embedded-bytes.json
new file mode 100644
index 000000000..5e064d2d7
--- /dev/null
+++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/resources/test-documents/test-with-embedded-bytes.json
@@ -0,0 +1,100 @@
+{"id":"myid-0","fetcher":"fs","fetchKey":"0.xml","emitter":"fs","emitKey":"0.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-1","fetcher":"fs","fetchKey":"1.xml","emitter":"fs","emitKey":"1.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-2","fetcher":"fs","fetchKey":"2.xml","emitter":"fs","emitKey":"2.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-3","fetcher":"fs","fetchKey":"3.xml","emitter":"fs","emitKey":"3.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-4","fetcher":"fs","fetchKey":"4.xml","emitter":"fs","emitKey":"4.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-5","fetcher":"fs","fetchKey":"5.xml","emitter":"fs","emitKey":"5.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-6","fetcher":"fs","fetchKey":"6.xml","emitter":"fs","emitKey":"6.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-7","fetcher":"fs","fetchKey":"7.xml","emitter":"fs","emitKey":"7.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-8","fetcher":"fs","fetchKey":"8.xml","emitter":"fs","emitKey":"8.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-9","fetcher":"fs","fetchKey":"9.xml","emitter":"fs","emitKey":"9.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-10","fetcher":"fs","fetchKey":"10.xml","emitter":"fs","emitKey":"10.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-11","fetcher":"fs","fetchKey":"11.xml","emitter":"fs","emitKey":"11.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-12","fetcher":"fs","fetchKey":"12.xml","emitter":"fs","emitKey":"12.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-13","fetcher":"fs","fetchKey":"13.xml","emitter":"fs","emitKey":"13.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-14","fetcher":"fs","fetchKey":"14.xml","emitter":"fs","emitKey":"14.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-15","fetcher":"fs","fetchKey":"15.xml","emitter":"fs","emitKey":"15.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-16","fetcher":"fs","fetchKey":"16.xml","emitter":"fs","emitKey":"16.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-17","fetcher":"fs","fetchKey":"17.xml","emitter":"fs","emitKey":"17.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-18","fetcher":"fs","fetchKey":"18.xml","emitter":"fs","emitKey":"18.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-19","fetcher":"fs","fetchKey":"19.xml","emitter":"fs","emitKey":"19.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-20","fetcher":"fs","fetchKey":"20.xml","emitter":"fs","emitKey":"20.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-21","fetcher":"fs","fetchKey":"21.xml","emitter":"fs","emitKey":"21.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-22","fetcher":"fs","fetchKey":"22.xml","emitter":"fs","emitKey":"22.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-23","fetcher":"fs","fetchKey":"23.xml","emitter":"fs","emitKey":"23.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-24","fetcher":"fs","fetchKey":"24.xml","emitter":"fs","emitKey":"24.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-25","fetcher":"fs","fetchKey":"25.xml","emitter":"fs","emitKey":"25.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-26","fetcher":"fs","fetchKey":"26.xml","emitter":"fs","emitKey":"26.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-27","fetcher":"fs","fetchKey":"27.xml","emitter":"fs","emitKey":"27.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-28","fetcher":"fs","fetchKey":"28.xml","emitter":"fs","emitKey":"28.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-29","fetcher":"fs","fetchKey":"29.xml","emitter":"fs","emitKey":"29.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-30","fetcher":"fs","fetchKey":"30.xml","emitter":"fs","emitKey":"30.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-31","fetcher":"fs","fetchKey":"31.xml","emitter":"fs","emitKey":"31.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-32","fetcher":"fs","fetchKey":"32.xml","emitter":"fs","emitKey":"32.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-33","fetcher":"fs","fetchKey":"33.xml","emitter":"fs","emitKey":"33.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-34","fetcher":"fs","fetchKey":"34.xml","emitter":"fs","emitKey":"34.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-35","fetcher":"fs","fetchKey":"35.xml","emitter":"fs","emitKey":"35.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-36","fetcher":"fs","fetchKey":"36.xml","emitter":"fs","emitKey":"36.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-37","fetcher":"fs","fetchKey":"37.xml","emitter":"fs","emitKey":"37.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-38","fetcher":"fs","fetchKey":"38.xml","emitter":"fs","emitKey":"38.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-39","fetcher":"fs","fetchKey":"39.xml","emitter":"fs","emitKey":"39.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-40","fetcher":"fs","fetchKey":"40.xml","emitter":"fs","emitKey":"40.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-41","fetcher":"fs","fetchKey":"41.xml","emitter":"fs","emitKey":"41.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-42","fetcher":"fs","fetchKey":"42.xml","emitter":"fs","emitKey":"42.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-43","fetcher":"fs","fetchKey":"43.xml","emitter":"fs","emitKey":"43.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-44","fetcher":"fs","fetchKey":"44.xml","emitter":"fs","emitKey":"44.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-45","fetcher":"fs","fetchKey":"45.xml","emitter":"fs","emitKey":"45.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-46","fetcher":"fs","fetchKey":"46.xml","emitter":"fs","emitKey":"46.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-47","fetcher":"fs","fetchKey":"47.xml","emitter":"fs","emitKey":"47.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-48","fetcher":"fs","fetchKey":"48.xml","emitter":"fs","emitKey":"48.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-49","fetcher":"fs","fetchKey":"49.xml","emitter":"fs","emitKey":"49.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-50","fetcher":"fs","fetchKey":"50.xml","emitter":"fs","emitKey":"50.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-51","fetcher":"fs","fetchKey":"51.xml","emitter":"fs","emitKey":"51.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-52","fetcher":"fs","fetchKey":"52.xml","emitter":"fs","emitKey":"52.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-53","fetcher":"fs","fetchKey":"53.xml","emitter":"fs","emitKey":"53.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-54","fetcher":"fs","fetchKey":"54.xml","emitter":"fs","emitKey":"54.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-55","fetcher":"fs","fetchKey":"55.xml","emitter":"fs","emitKey":"55.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-56","fetcher":"fs","fetchKey":"56.xml","emitter":"fs","emitKey":"56.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-57","fetcher":"fs","fetchKey":"57.xml","emitter":"fs","emitKey":"57.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-58","fetcher":"fs","fetchKey":"58.xml","emitter":"fs","emitKey":"58.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-59","fetcher":"fs","fetchKey":"59.xml","emitter":"fs","emitKey":"59.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-60","fetcher":"fs","fetchKey":"60.xml","emitter":"fs","emitKey":"60.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-61","fetcher":"fs","fetchKey":"61.xml","emitter":"fs","emitKey":"61.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-62","fetcher":"fs","fetchKey":"62.xml","emitter":"fs","emitKey":"62.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-63","fetcher":"fs","fetchKey":"63.xml","emitter":"fs","emitKey":"63.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-64","fetcher":"fs","fetchKey":"64.xml","emitter":"fs","emitKey":"64.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-65","fetcher":"fs","fetchKey":"65.xml","emitter":"fs","emitKey":"65.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-66","fetcher":"fs","fetchKey":"66.xml","emitter":"fs","emitKey":"66.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-67","fetcher":"fs","fetchKey":"67.xml","emitter":"fs","emitKey":"67.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-68","fetcher":"fs","fetchKey":"68.xml","emitter":"fs","emitKey":"68.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-69","fetcher":"fs","fetchKey":"69.xml","emitter":"fs","emitKey":"69.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-70","fetcher":"fs","fetchKey":"70.xml","emitter":"fs","emitKey":"70.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-71","fetcher":"fs","fetchKey":"71.xml","emitter":"fs","emitKey":"71.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-72","fetcher":"fs","fetchKey":"72.xml","emitter":"fs","emitKey":"72.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-73","fetcher":"fs","fetchKey":"73.xml","emitter":"fs","emitKey":"73.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-74","fetcher":"fs","fetchKey":"74.xml","emitter":"fs","emitKey":"74.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-75","fetcher":"fs","fetchKey":"75.xml","emitter":"fs","emitKey":"75.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-76","fetcher":"fs","fetchKey":"76.xml","emitter":"fs","emitKey":"76.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-77","fetcher":"fs","fetchKey":"77.xml","emitter":"fs","emitKey":"77.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-78","fetcher":"fs","fetchKey":"78.xml","emitter":"fs","emitKey":"78.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-79","fetcher":"fs","fetchKey":"79.xml","emitter":"fs","emitKey":"79.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-80","fetcher":"fs","fetchKey":"80.xml","emitter":"fs","emitKey":"80.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-81","fetcher":"fs","fetchKey":"81.xml","emitter":"fs","emitKey":"81.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-82","fetcher":"fs","fetchKey":"82.xml","emitter":"fs","emitKey":"82.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-83","fetcher":"fs","fetchKey":"83.xml","emitter":"fs","emitKey":"83.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-84","fetcher":"fs","fetchKey":"84.xml","emitter":"fs","emitKey":"84.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-85","fetcher":"fs","fetchKey":"85.xml","emitter":"fs","emitKey":"85.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-86","fetcher":"fs","fetchKey":"86.xml","emitter":"fs","emitKey":"86.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-87","fetcher":"fs","fetchKey":"87.xml","emitter":"fs","emitKey":"87.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-88","fetcher":"fs","fetchKey":"88.xml","emitter":"fs","emitKey":"88.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-89","fetcher":"fs","fetchKey":"89.xml","emitter":"fs","emitKey":"89.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-90","fetcher":"fs","fetchKey":"90.xml","emitter":"fs","emitKey":"90.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-91","fetcher":"fs","fetchKey":"91.xml","emitter":"fs","emitKey":"91.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-92","fetcher":"fs","fetchKey":"92.xml","emitter":"fs","emitKey":"92.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-93","fetcher":"fs","fetchKey":"93.xml","emitter":"fs","emitKey":"93.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-94","fetcher":"fs","fetchKey":"94.xml","emitter":"fs","emitKey":"94.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-95","fetcher":"fs","fetchKey":"95.xml","emitter":"fs","emitKey":"95.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-96","fetcher":"fs","fetchKey":"96.xml","emitter":"fs","emitKey":"96.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-97","fetcher":"fs","fetchKey":"97.xml","emitter":"fs","emitKey":"97.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-98","fetcher":"fs","fetchKey":"98.xml","emitter":"fs","emitKey":"98.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
+{"id":"myid-99","fetcher":"fs","fetchKey":"99.xml","emitter":"fs","emitKey":"99.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit","embeddedDocumentBytesConfig":{"extractEmbeddedDocumentBytes":true,"zeroPadName":0,"suffixStrategy":"NONE","embeddedIdPrefix":"-","includeOriginal":false}}
diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/resources/test-documents/test.json b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/resources/test-documents/test.json
new file mode 100644
index 000000000..199772ecb
--- /dev/null
+++ b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/resources/test-documents/test.json
@@ -0,0 +1,100 @@
+{"id":"myid-0","fetcher":"fs","fetchKey":"0.xml","emitter":"fs","emitKey":"0.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-1","fetcher":"fs","fetchKey":"1.xml","emitter":"fs","emitKey":"1.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-2","fetcher":"fs","fetchKey":"2.xml","emitter":"fs","emitKey":"2.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-3","fetcher":"fs","fetchKey":"3.xml","emitter":"fs","emitKey":"3.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-4","fetcher":"fs","fetchKey":"4.xml","emitter":"fs","emitKey":"4.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-5","fetcher":"fs","fetchKey":"5.xml","emitter":"fs","emitKey":"5.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-6","fetcher":"fs","fetchKey":"6.xml","emitter":"fs","emitKey":"6.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-7","fetcher":"fs","fetchKey":"7.xml","emitter":"fs","emitKey":"7.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-8","fetcher":"fs","fetchKey":"8.xml","emitter":"fs","emitKey":"8.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-9","fetcher":"fs","fetchKey":"9.xml","emitter":"fs","emitKey":"9.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-10","fetcher":"fs","fetchKey":"10.xml","emitter":"fs","emitKey":"10.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-11","fetcher":"fs","fetchKey":"11.xml","emitter":"fs","emitKey":"11.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-12","fetcher":"fs","fetchKey":"12.xml","emitter":"fs","emitKey":"12.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-13","fetcher":"fs","fetchKey":"13.xml","emitter":"fs","emitKey":"13.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-14","fetcher":"fs","fetchKey":"14.xml","emitter":"fs","emitKey":"14.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-15","fetcher":"fs","fetchKey":"15.xml","emitter":"fs","emitKey":"15.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-16","fetcher":"fs","fetchKey":"16.xml","emitter":"fs","emitKey":"16.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-17","fetcher":"fs","fetchKey":"17.xml","emitter":"fs","emitKey":"17.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-18","fetcher":"fs","fetchKey":"18.xml","emitter":"fs","emitKey":"18.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-19","fetcher":"fs","fetchKey":"19.xml","emitter":"fs","emitKey":"19.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-20","fetcher":"fs","fetchKey":"20.xml","emitter":"fs","emitKey":"20.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-21","fetcher":"fs","fetchKey":"21.xml","emitter":"fs","emitKey":"21.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-22","fetcher":"fs","fetchKey":"22.xml","emitter":"fs","emitKey":"22.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-23","fetcher":"fs","fetchKey":"23.xml","emitter":"fs","emitKey":"23.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-24","fetcher":"fs","fetchKey":"24.xml","emitter":"fs","emitKey":"24.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-25","fetcher":"fs","fetchKey":"25.xml","emitter":"fs","emitKey":"25.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-26","fetcher":"fs","fetchKey":"26.xml","emitter":"fs","emitKey":"26.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-27","fetcher":"fs","fetchKey":"27.xml","emitter":"fs","emitKey":"27.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-28","fetcher":"fs","fetchKey":"28.xml","emitter":"fs","emitKey":"28.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-29","fetcher":"fs","fetchKey":"29.xml","emitter":"fs","emitKey":"29.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-30","fetcher":"fs","fetchKey":"30.xml","emitter":"fs","emitKey":"30.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-31","fetcher":"fs","fetchKey":"31.xml","emitter":"fs","emitKey":"31.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-32","fetcher":"fs","fetchKey":"32.xml","emitter":"fs","emitKey":"32.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-33","fetcher":"fs","fetchKey":"33.xml","emitter":"fs","emitKey":"33.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-34","fetcher":"fs","fetchKey":"34.xml","emitter":"fs","emitKey":"34.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-35","fetcher":"fs","fetchKey":"35.xml","emitter":"fs","emitKey":"35.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-36","fetcher":"fs","fetchKey":"36.xml","emitter":"fs","emitKey":"36.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-37","fetcher":"fs","fetchKey":"37.xml","emitter":"fs","emitKey":"37.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-38","fetcher":"fs","fetchKey":"38.xml","emitter":"fs","emitKey":"38.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-39","fetcher":"fs","fetchKey":"39.xml","emitter":"fs","emitKey":"39.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-40","fetcher":"fs","fetchKey":"40.xml","emitter":"fs","emitKey":"40.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-41","fetcher":"fs","fetchKey":"41.xml","emitter":"fs","emitKey":"41.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-42","fetcher":"fs","fetchKey":"42.xml","emitter":"fs","emitKey":"42.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-43","fetcher":"fs","fetchKey":"43.xml","emitter":"fs","emitKey":"43.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-44","fetcher":"fs","fetchKey":"44.xml","emitter":"fs","emitKey":"44.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-45","fetcher":"fs","fetchKey":"45.xml","emitter":"fs","emitKey":"45.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-46","fetcher":"fs","fetchKey":"46.xml","emitter":"fs","emitKey":"46.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-47","fetcher":"fs","fetchKey":"47.xml","emitter":"fs","emitKey":"47.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-48","fetcher":"fs","fetchKey":"48.xml","emitter":"fs","emitKey":"48.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-49","fetcher":"fs","fetchKey":"49.xml","emitter":"fs","emitKey":"49.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-50","fetcher":"fs","fetchKey":"50.xml","emitter":"fs","emitKey":"50.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-51","fetcher":"fs","fetchKey":"51.xml","emitter":"fs","emitKey":"51.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-52","fetcher":"fs","fetchKey":"52.xml","emitter":"fs","emitKey":"52.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-53","fetcher":"fs","fetchKey":"53.xml","emitter":"fs","emitKey":"53.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-54","fetcher":"fs","fetchKey":"54.xml","emitter":"fs","emitKey":"54.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-55","fetcher":"fs","fetchKey":"55.xml","emitter":"fs","emitKey":"55.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-56","fetcher":"fs","fetchKey":"56.xml","emitter":"fs","emitKey":"56.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-57","fetcher":"fs","fetchKey":"57.xml","emitter":"fs","emitKey":"57.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-58","fetcher":"fs","fetchKey":"58.xml","emitter":"fs","emitKey":"58.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-59","fetcher":"fs","fetchKey":"59.xml","emitter":"fs","emitKey":"59.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-60","fetcher":"fs","fetchKey":"60.xml","emitter":"fs","emitKey":"60.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-61","fetcher":"fs","fetchKey":"61.xml","emitter":"fs","emitKey":"61.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-62","fetcher":"fs","fetchKey":"62.xml","emitter":"fs","emitKey":"62.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-63","fetcher":"fs","fetchKey":"63.xml","emitter":"fs","emitKey":"63.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-64","fetcher":"fs","fetchKey":"64.xml","emitter":"fs","emitKey":"64.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-65","fetcher":"fs","fetchKey":"65.xml","emitter":"fs","emitKey":"65.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-66","fetcher":"fs","fetchKey":"66.xml","emitter":"fs","emitKey":"66.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-67","fetcher":"fs","fetchKey":"67.xml","emitter":"fs","emitKey":"67.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-68","fetcher":"fs","fetchKey":"68.xml","emitter":"fs","emitKey":"68.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-69","fetcher":"fs","fetchKey":"69.xml","emitter":"fs","emitKey":"69.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-70","fetcher":"fs","fetchKey":"70.xml","emitter":"fs","emitKey":"70.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-71","fetcher":"fs","fetchKey":"71.xml","emitter":"fs","emitKey":"71.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-72","fetcher":"fs","fetchKey":"72.xml","emitter":"fs","emitKey":"72.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-73","fetcher":"fs","fetchKey":"73.xml","emitter":"fs","emitKey":"73.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-74","fetcher":"fs","fetchKey":"74.xml","emitter":"fs","emitKey":"74.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-75","fetcher":"fs","fetchKey":"75.xml","emitter":"fs","emitKey":"75.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-76","fetcher":"fs","fetchKey":"76.xml","emitter":"fs","emitKey":"76.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-77","fetcher":"fs","fetchKey":"77.xml","emitter":"fs","emitKey":"77.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-78","fetcher":"fs","fetchKey":"78.xml","emitter":"fs","emitKey":"78.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-79","fetcher":"fs","fetchKey":"79.xml","emitter":"fs","emitKey":"79.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-80","fetcher":"fs","fetchKey":"80.xml","emitter":"fs","emitKey":"80.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-81","fetcher":"fs","fetchKey":"81.xml","emitter":"fs","emitKey":"81.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-82","fetcher":"fs","fetchKey":"82.xml","emitter":"fs","emitKey":"82.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-83","fetcher":"fs","fetchKey":"83.xml","emitter":"fs","emitKey":"83.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-84","fetcher":"fs","fetchKey":"84.xml","emitter":"fs","emitKey":"84.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-85","fetcher":"fs","fetchKey":"85.xml","emitter":"fs","emitKey":"85.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-86","fetcher":"fs","fetchKey":"86.xml","emitter":"fs","emitKey":"86.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-87","fetcher":"fs","fetchKey":"87.xml","emitter":"fs","emitKey":"87.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-88","fetcher":"fs","fetchKey":"88.xml","emitter":"fs","emitKey":"88.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-89","fetcher":"fs","fetchKey":"89.xml","emitter":"fs","emitKey":"89.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-90","fetcher":"fs","fetchKey":"90.xml","emitter":"fs","emitKey":"90.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-91","fetcher":"fs","fetchKey":"91.xml","emitter":"fs","emitKey":"91.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-92","fetcher":"fs","fetchKey":"92.xml","emitter":"fs","emitKey":"92.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-93","fetcher":"fs","fetchKey":"93.xml","emitter":"fs","emitKey":"93.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-94","fetcher":"fs","fetchKey":"94.xml","emitter":"fs","emitKey":"94.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-95","fetcher":"fs","fetchKey":"95.xml","emitter":"fs","emitKey":"95.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-96","fetcher":"fs","fetchKey":"96.xml","emitter":"fs","emitKey":"96.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-97","fetcher":"fs","fetchKey":"97.xml","emitter":"fs","emitKey":"97.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-98","fetcher":"fs","fetchKey":"98.xml","emitter":"fs","emitKey":"98.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
+{"id":"myid-99","fetcher":"fs","fetchKey":"99.xml","emitter":"fs","emitKey":"99.xml.json","handlerConfig":{"type":"text","parseMode":"rmeta","writeLimit":-1,"maxEmbeddedResources":-1},"onParseException":"emit"}
diff --git a/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonFetchEmitTuple.java b/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonFetchEmitTuple.java
index e1bec421a..ed5931932 100644
--- a/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonFetchEmitTuple.java
+++ b/tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonFetchEmitTuple.java
@@ -32,8 +32,8 @@ import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.HandlerConfig;
-import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
import org.apache.tika.pipes.emitter.EmitKey;
+import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
import org.apache.tika.pipes.fetcher.FetchKey;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.utils.StringUtils;
@@ -56,6 +56,11 @@ public class JsonFetchEmitTuple {
private static final String HANDLER_CONFIG_PARSE_MODE = "parseMode";
private static final String EMBEDDED_DOCUMENT_BYTES_CONFIG = "embeddedDocumentBytesConfig";
+ private static final String ZERO_PAD_NAME = "zeroPadName";
+ private static final String EXTRACT_EMBEDDED_DOCUMENT_BYTES = "extractEmbeddedDocumentBytes";
+ private static final String SUFFIX_STRATEGY = "suffixStrategy";
+ private static final String EMBEDDED_ID_PREFIX = "embeddedIdPrefix";
+ private static final String INCLUDE_ORIGINAL = "includeOriginal";
public static FetchEmitTuple fromJson(Reader reader) throws IOException {
@@ -147,26 +152,35 @@ public class JsonFetchEmitTuple {
EmbeddedDocumentBytesConfig config = new EmbeddedDocumentBytesConfig(true);
while (fieldName != null) {
switch (fieldName) {
- //TODO: fill in more here!
- case "extractEmbeddedDocumentBytes":
+ case EXTRACT_EMBEDDED_DOCUMENT_BYTES:
boolean extract = jParser.nextBooleanValue();
if (! extract) {
return new EmbeddedDocumentBytesConfig(false);
}
break;
- case "includeOriginal":
+ case INCLUDE_ORIGINAL:
config.setIncludeOriginal(jParser.nextBooleanValue());
break;
- case "emitter":
+ case EMITTER:
config.setEmitter(jParser.nextTextValue());
break;
+ case ZERO_PAD_NAME:
+ config.setZeroPadNameLength(jParser.nextIntValue(0));
+ break;
+ case SUFFIX_STRATEGY:
+ config.setSuffixStrategy(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.parse(
+ jParser.nextTextValue()));
+ break;
+ case EMBEDDED_ID_PREFIX:
+ config.setEmbeddedIdPrefix(jParser.nextTextValue());
+ break;
default:
throw new IllegalArgumentException("I regret I don't understand '" + fieldName +
"' in the context of an embeddedDocumentBytesConfig");
}
fieldName = jParser.nextFieldName();
}
- return EmbeddedDocumentBytesConfig.SKIP;
+ return config;
}
private static HandlerConfig getHandlerConfig(JsonParser jParser) throws IOException {
@@ -270,6 +284,22 @@ public class JsonFetchEmitTuple {
}
jsonGenerator.writeStringField(ON_PARSE_EXCEPTION,
t.getOnParseException().name().toLowerCase(Locale.US));
+ if (t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes()) {
+ EmbeddedDocumentBytesConfig edbc = t.getEmbeddedDocumentBytesConfig();
+ jsonGenerator.writeFieldName(EMBEDDED_DOCUMENT_BYTES_CONFIG);
+ jsonGenerator.writeStartObject();
+ jsonGenerator.writeBooleanField(EXTRACT_EMBEDDED_DOCUMENT_BYTES,
+ edbc.isExtractEmbeddedDocumentBytes());
+ jsonGenerator.writeNumberField(ZERO_PAD_NAME, edbc.getZeroPadName());
+ jsonGenerator.writeStringField(SUFFIX_STRATEGY,
+ edbc.getSuffixStrategy().toString());
+ jsonGenerator.writeStringField(EMBEDDED_ID_PREFIX, edbc.getEmbeddedIdPrefix());
+ if (! StringUtils.isBlank(edbc.getEmitter())) {
+ jsonGenerator.writeStringField(EMITTER, edbc.getEmitter());
+ }
+ jsonGenerator.writeBooleanField(INCLUDE_ORIGINAL, edbc.isIncludeOriginal());
+ jsonGenerator.writeEndObject();
+ }
jsonGenerator.writeEndObject();
}
diff --git a/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonFetchEmitTupleTest.java b/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonFetchEmitTupleTest.java
index aeb4fefd4..4484478dc 100644
--- a/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonFetchEmitTupleTest.java
+++ b/tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonFetchEmitTupleTest.java
@@ -28,6 +28,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.HandlerConfig;
import org.apache.tika.pipes.emitter.EmitKey;
+import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
import org.apache.tika.pipes.fetcher.FetchKey;
import org.apache.tika.sax.BasicContentHandlerFactory;
@@ -77,4 +78,23 @@ public class JsonFetchEmitTupleTest {
FetchEmitTuple deserialized = JsonFetchEmitTuple.fromJson(reader);
assertEquals(t, deserialized);
}
+
+ /** Round-trips a tuple whose bytes config explicitly sets emitter, includeOriginal, zeroPadName and embeddedIdPrefix. */
+ @Test
+ public void testBytes() throws Exception {
+ EmbeddedDocumentBytesConfig bytesConfig = new EmbeddedDocumentBytesConfig(true);
+ bytesConfig.setEmitter("emitter");
+ bytesConfig.setIncludeOriginal(true);
+ bytesConfig.setZeroPadNameLength(10);
+ bytesConfig.setEmbeddedIdPrefix("-embed-");
+ FetchEmitTuple t = new FetchEmitTuple("my_id",
+ new FetchKey("my_fetcher", "fetchKey1", 10, 1000),
+ new EmitKey("my_emitter", "emitKey1"), new Metadata(),
+ new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML,
+ HandlerConfig.PARSE_MODE.CONCATENATE, 10000, 10, true),
+ FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP, bytesConfig);
+ StringWriter writer = new StringWriter();
+ JsonFetchEmitTuple.toJson(t, writer);
+ assertEquals(t, JsonFetchEmitTuple.fromJson(new StringReader(writer.toString())));
+ }
}